From bb41c219f6449a2530d7db830be551d58da485ed Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 16:58:02 -0300 Subject: [PATCH 001/416] feat: add Ariadne service and glue scheduling --- scripts/dashboards_render_atlas.py | 37 ++++ .../vaultwarden-cred-sync-cronjob.yaml | 1 + services/finance/portal-rbac.yaml | 14 ++ services/health/portal-rbac.yaml | 16 +- .../health/wger-admin-ensure-cronjob.yaml | 1 + services/keycloak/realm-settings-job.yaml | 2 + services/mailu/mailu-sync-cronjob.yaml | 1 + services/maintenance/ariadne-deployment.yaml | 181 ++++++++++++++++++ services/maintenance/ariadne-service.yaml | 13 ++ .../maintenance/ariadne-serviceaccount.yaml | 8 + services/maintenance/kustomization.yaml | 6 + services/maintenance/secretproviderclass.yaml | 21 ++ .../maintenance/vault-serviceaccount.yaml | 6 + .../maintenance/vault-sync-deployment.yaml | 34 ++++ .../monitoring/dashboards/atlas-testing.json | 150 +++++++++++++++ .../monitoring/grafana-dashboard-testing.yaml | 150 +++++++++++++++ services/nextcloud-mail-sync/cronjob.yaml | 1 + services/nextcloud-mail-sync/portal-rbac.yaml | 13 ++ .../vault/scripts/vault_k8s_auth_configure.sh | 2 + services/vaultwarden/ariadne-rbac.yaml | 28 +++ services/vaultwarden/kustomization.yaml | 1 + 21 files changed, 685 insertions(+), 1 deletion(-) create mode 100644 services/maintenance/ariadne-deployment.yaml create mode 100644 services/maintenance/ariadne-service.yaml create mode 100644 services/maintenance/ariadne-serviceaccount.yaml create mode 100644 services/maintenance/secretproviderclass.yaml create mode 100644 services/maintenance/vault-serviceaccount.yaml create mode 100644 services/maintenance/vault-sync-deployment.yaml create mode 100644 services/vaultwarden/ariadne-rbac.yaml diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 0931b48..116bf21 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -336,6 +336,10 @@ GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPE GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" +ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' +ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" +ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -2230,6 +2234,39 @@ def build_testing_dashboard(): instant=True, ) ) + panels.append( + table_panel( + 7, + "Ariadne Task Errors (24h)", + ARIADNE_TASK_ERRORS_24H, + {"h": 6, "w": 12, "x": 0, "y": 12}, + unit="none", + transformations=sort_desc, + instant=True, + ) + ) + panels.append( + table_panel( + 8, + "Ariadne Schedule Last Success (hours ago)", + ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 12}, + unit="h", + transformations=sort_desc, + instant=True, + ) + ) + panels.append( + table_panel( + 9, + "Ariadne Access Requests", + ARIADNE_ACCESS_REQUESTS, + {"h": 4, "w": 24, "x": 0, "y": 18}, + unit="none", + transformations=sort_desc, + instant=True, + ) + ) return { "uid": "atlas-testing", diff --git 
a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml index 29141fe..acd851b 100644 --- a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml +++ b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/finance/portal-rbac.yaml b/services/finance/portal-rbac.yaml index 2fb7ede..66eafea 100644 --- a/services/finance/portal-rbac.yaml +++ b/services/finance/portal-rbac.yaml @@ -29,3 +29,17 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-firefly-user-sync + namespace: finance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-firefly-user-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/health/portal-rbac.yaml b/services/health/portal-rbac.yaml index cd9acd1..feb7441 100644 --- a/services/health/portal-rbac.yaml +++ b/services/health/portal-rbac.yaml @@ -8,7 +8,7 @@ rules: - apiGroups: ["batch"] resources: ["cronjobs"] verbs: ["get"] - resourceNames: ["wger-user-sync"] + resourceNames: ["wger-user-sync", "wger-admin-ensure"] - apiGroups: ["batch"] resources: ["jobs"] verbs: ["create", "get", "list", "watch"] @@ -29,3 +29,17 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-wger-user-sync + namespace: health +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-wger-user-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/health/wger-admin-ensure-cronjob.yaml b/services/health/wger-admin-ensure-cronjob.yaml index db178a3..a1063dd 100644 --- a/services/health/wger-admin-ensure-cronjob.yaml +++ b/services/health/wger-admin-ensure-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "15 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index f680200..a0b36ec 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -331,6 +331,8 @@ spec: # Ensure basic realm groups exist for provisioning. 
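            # ensure_group is assumed to look the realm group up by name, create it
            # only when missing, and return its representation (planka_group.get("id")
            # below relies on that), so reruns of this job stay idempotent.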
ensure_group("dev") ensure_group("admin") + ensure_group("demo") + ensure_group("test") planka_group = ensure_group("planka-users") if planka_group and planka_group.get("id"): diff --git a/services/mailu/mailu-sync-cronjob.yaml b/services/mailu/mailu-sync-cronjob.yaml index 1da1981..671439d 100644 --- a/services/mailu/mailu-sync-cronjob.yaml +++ b/services/mailu/mailu-sync-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "30 4 * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml new file mode 100644 index 0000000..fd2fb79 --- /dev/null +++ b/services/maintenance/ariadne-deployment.yaml @@ -0,0 +1,181 @@ +# services/maintenance/ariadne-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ariadne + namespace: maintenance +spec: + replicas: 1 + revisionHistoryLimit: 3 + selector: + matchLabels: + app: ariadne + template: + metadata: + labels: + app: ariadne + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "maintenance" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/portal/atlas-portal-db" + vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} + export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" + {{ end }} + {{ with secret "kv/data/atlas/mailu/mailu-db-secret" }} + export MAILU_DB_NAME="{{ .Data.data.database }}" + export MAILU_DB_USER="{{ .Data.data.username }}" + export MAILU_DB_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/mailu/mailu-initial-account-secret" }} + export SMTP_HOST="mailu-front.mailu-mailserver.svc.cluster.local" + export SMTP_PORT="587" + export SMTP_STARTTLS="true" + export SMTP_USE_TLS="false" + export SMTP_USERNAME="no-reply-portal@bstein.dev" + export SMTP_PASSWORD="{{ .Data.data.password }}" + export SMTP_FROM="no-reply-portal@bstein.dev" + {{ end }} + spec: + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: ariadne + image: registry.bstein.dev/bstein/ariadne:0.1.0 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . 
/vault/secrets/ariadne-env.sh + && exec uvicorn ariadne.app:app --host 0.0.0.0 --port 8080 + ports: + - name: http + containerPort: 8080 + env: + - name: KEYCLOAK_URL + value: https://sso.bstein.dev + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_CLIENT_ID + value: bstein-dev-home + - name: KEYCLOAK_ISSUER + value: https://sso.bstein.dev/realms/atlas + - name: KEYCLOAK_JWKS_URL + value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs + - name: KEYCLOAK_ADMIN_URL + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_ADMIN_REALM + value: atlas + - name: KEYCLOAK_ADMIN_CLIENT_ID + value: bstein-dev-home-admin + - name: PORTAL_PUBLIC_BASE_URL + value: https://bstein.dev + - name: PORTAL_ADMIN_USERS + value: bstein + - name: PORTAL_ADMIN_GROUPS + value: admin + - name: ACCOUNT_ALLOWED_GROUPS + value: dev,admin + - name: ALLOWED_FLAG_GROUPS + value: demo,test + - name: DEFAULT_USER_GROUPS + value: dev + - name: MAILU_DOMAIN + value: bstein.dev + - name: MAILU_SYNC_URL + value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events + - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC + value: "60" + - name: MAILU_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: MAILU_DB_PORT + value: "5432" + - name: NEXTCLOUD_NAMESPACE + value: nextcloud + - name: NEXTCLOUD_MAIL_SYNC_CRONJOB + value: nextcloud-mail-sync + - name: NEXTCLOUD_MAIL_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: NEXTCLOUD_MAIL_SYNC_JOB_TTL_SEC + value: "3600" + - name: WGER_NAMESPACE + value: health + - name: WGER_USER_SYNC_CRONJOB + value: wger-user-sync + - name: WGER_ADMIN_CRONJOB + value: wger-admin-ensure + - name: WGER_USER_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: FIREFLY_NAMESPACE + value: finance + - name: FIREFLY_USER_SYNC_CRONJOB + value: firefly-user-sync + - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: VAULTWARDEN_NAMESPACE + value: vaultwarden + - name: VAULTWARDEN_POD_LABEL + value: app=vaultwarden + - name: VAULTWARDEN_POD_PORT + value: "80" + - name: VAULTWARDEN_SERVICE_HOST + value: vaultwarden-service.vaultwarden.svc.cluster.local + - name: VAULTWARDEN_ADMIN_SECRET_NAME + value: vaultwarden-admin + - name: VAULTWARDEN_ADMIN_SECRET_KEY + value: ADMIN_TOKEN + - name: VAULTWARDEN_ADMIN_SESSION_TTL_SEC + value: "900" + - name: VAULTWARDEN_ADMIN_RATE_LIMIT_BACKOFF_SEC + value: "600" + - name: VAULTWARDEN_RETRY_COOLDOWN_SEC + value: "1800" + - name: VAULTWARDEN_FAILURE_BAILOUT + value: "2" + - name: ARIADNE_PROVISION_POLL_INTERVAL_SEC + value: "5" + - name: ARIADNE_PROVISION_RETRY_COOLDOWN_SEC + value: "30" + - name: ARIADNE_SCHEDULE_TICK_SEC + value: "5" + - name: ARIADNE_SCHEDULE_MAILU_SYNC + value: "30 4 * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC + value: "0 5 * * *" + - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC + value: "*/15 * * * *" + - name: ARIADNE_SCHEDULE_WGER_ADMIN + value: "15 3 * * *" + - name: WELCOME_EMAIL_ENABLED + value: "true" + - name: K8S_API_TIMEOUT_SEC + value: "5" + - name: METRICS_PATH + value: "/metrics" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 10 diff --git a/services/maintenance/ariadne-service.yaml b/services/maintenance/ariadne-service.yaml new file mode 100644 index 0000000..9c93e1d --- /dev/null +++ 
b/services/maintenance/ariadne-service.yaml @@ -0,0 +1,13 @@ +# services/maintenance/ariadne-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: ariadne + namespace: maintenance +spec: + selector: + app: ariadne + ports: + - name: http + port: 80 + targetPort: http diff --git a/services/maintenance/ariadne-serviceaccount.yaml b/services/maintenance/ariadne-serviceaccount.yaml new file mode 100644 index 0000000..9adcef7 --- /dev/null +++ b/services/maintenance/ariadne-serviceaccount.yaml @@ -0,0 +1,8 @@ +# services/maintenance/ariadne-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ariadne + namespace: maintenance +imagePullSecrets: + - name: harbor-regcred diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index e53ed3c..f0f3de5 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -3,10 +3,16 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - namespace.yaml + - secretproviderclass.yaml + - vault-serviceaccount.yaml + - vault-sync-deployment.yaml + - ariadne-serviceaccount.yaml - disable-k3s-traefik-serviceaccount.yaml - k3s-traefik-cleanup-rbac.yaml - node-nofile-serviceaccount.yaml - pod-cleaner-rbac.yaml + - ariadne-deployment.yaml + - ariadne-service.yaml - disable-k3s-traefik-daemonset.yaml - k3s-traefik-cleanup-job.yaml - node-nofile-daemonset.yaml diff --git a/services/maintenance/secretproviderclass.yaml b/services/maintenance/secretproviderclass.yaml new file mode 100644 index 0000000..dd95948 --- /dev/null +++ b/services/maintenance/secretproviderclass.yaml @@ -0,0 +1,21 @@ +# services/maintenance/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: maintenance-vault + namespace: maintenance +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "maintenance" + objects: | + - objectName: "harbor-pull__dockerconfigjson" + secretPath: "kv/data/atlas/harbor-pull/maintenance" + secretKey: "dockerconfigjson" + secretObjects: + - secretName: harbor-regcred + type: kubernetes.io/dockerconfigjson + data: + - objectName: harbor-pull__dockerconfigjson + key: .dockerconfigjson diff --git a/services/maintenance/vault-serviceaccount.yaml b/services/maintenance/vault-serviceaccount.yaml new file mode 100644 index 0000000..f60b43e --- /dev/null +++ b/services/maintenance/vault-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/maintenance/vault-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: maintenance-vault-sync + namespace: maintenance diff --git a/services/maintenance/vault-sync-deployment.yaml b/services/maintenance/vault-sync-deployment.yaml new file mode 100644 index 0000000..edc0456 --- /dev/null +++ b/services/maintenance/vault-sync-deployment.yaml @@ -0,0 +1,34 @@ +# services/maintenance/vault-sync-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: maintenance-vault-sync + namespace: maintenance +spec: + replicas: 1 + selector: + matchLabels: + app: maintenance-vault-sync + template: + metadata: + labels: + app: maintenance-vault-sync + spec: + serviceAccountName: maintenance-vault-sync + containers: + - name: sync + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - "sleep infinity" + volumeMounts: + - name: vault-secrets + mountPath: /vault/secrets + readOnly: true + volumes: + - name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: 
true + volumeAttributes: + secretProviderClass: maintenance-vault diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index 25cf3f8..c9c0c9a 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -321,6 +321,156 @@ } } ] + }, + { + "id": 7, + "type": "table", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 8, + "type": "table", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 9, + "type": "table", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] } ], "time": { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 80a7043..7746f16 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -330,6 +330,156 @@ data: } } ] + }, + { + "id": 7, + "type": "table", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 8, + "type": "table", + "title": "Ariadne Schedule Last Success 
(hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 9, + "type": "table", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] } ], "time": { diff --git a/services/nextcloud-mail-sync/cronjob.yaml b/services/nextcloud-mail-sync/cronjob.yaml index 2073d76..6913b60 100644 --- a/services/nextcloud-mail-sync/cronjob.yaml +++ b/services/nextcloud-mail-sync/cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "0 5 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 3 failedJobsHistoryLimit: 1 diff --git a/services/nextcloud-mail-sync/portal-rbac.yaml b/services/nextcloud-mail-sync/portal-rbac.yaml index dc9a4e4..009b2e0 100644 --- a/services/nextcloud-mail-sync/portal-rbac.yaml +++ b/services/nextcloud-mail-sync/portal-rbac.yaml @@ -27,3 +27,16 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-nextcloud-mail-sync +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-nextcloud-mail-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 202879f..ca94ac6 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -230,6 +230,8 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ "crypto/* harbor-pull/crypto" "" write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" +write_policy_and_role "maintenance" "maintenance" "ariadne" \ + "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret harbor-pull/maintenance" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ diff --git a/services/vaultwarden/ariadne-rbac.yaml b/services/vaultwarden/ariadne-rbac.yaml new file mode 100644 index 0000000..ee903ca --- /dev/null +++ b/services/vaultwarden/ariadne-rbac.yaml @@ -0,0 +1,28 @@ +# services/vaultwarden/ariadne-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: 
ariadne-vaultwarden-admin-reader + namespace: vaultwarden +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get"] + resourceNames: ["vaultwarden-admin"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-vaultwarden-admin-reader + namespace: vaultwarden +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: ariadne-vaultwarden-admin-reader +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/vaultwarden/kustomization.yaml b/services/vaultwarden/kustomization.yaml index c53cb1c..ca5ef26 100644 --- a/services/vaultwarden/kustomization.yaml +++ b/services/vaultwarden/kustomization.yaml @@ -5,6 +5,7 @@ namespace: vaultwarden resources: - namespace.yaml - serviceaccount.yaml + - ariadne-rbac.yaml - pvc.yaml - deployment.yaml - service.yaml From ff3ed195ac8cb1333a4c8d660085139bd7503010 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 19:02:14 -0300 Subject: [PATCH 002/416] chore: centralize harbor pull credentials --- .../longhorn/core/secretproviderclass.yaml | 2 +- .../bstein-dev-home/secretproviderclass.yaml | 2 +- services/comms/secretproviderclass.yaml | 2 +- .../crypto/xmr-miner/secretproviderclass.yaml | 2 +- services/harbor/secretproviderclass.yaml | 2 +- services/keycloak/secretproviderclass.yaml | 2 +- services/logging/secretproviderclass.yaml | 2 +- services/mailu/secretproviderclass.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 2 +- services/maintenance/image.yaml | 21 ++++++++++++++++++ services/maintenance/kustomization.yaml | 5 +++++ services/maintenance/secretproviderclass.yaml | 2 +- services/monitoring/secretproviderclass.yaml | 2 +- services/pegasus/secretproviderclass.yaml | 2 +- .../vault/scripts/vault_k8s_auth_configure.sh | 22 +++++++++---------- 15 files changed, 49 insertions(+), 23 deletions(-) create mode 100644 services/maintenance/image.yaml diff --git a/infrastructure/longhorn/core/secretproviderclass.yaml b/infrastructure/longhorn/core/secretproviderclass.yaml index 031d1d8..e292b86 100644 --- a/infrastructure/longhorn/core/secretproviderclass.yaml +++ b/infrastructure/longhorn/core/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "longhorn" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/longhorn" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: longhorn-registry diff --git a/services/bstein-dev-home/secretproviderclass.yaml b/services/bstein-dev-home/secretproviderclass.yaml index f330fe6..2fa714a 100644 --- a/services/bstein-dev-home/secretproviderclass.yaml +++ b/services/bstein-dev-home/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "bstein-dev-home" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/bstein-dev-home" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/comms/secretproviderclass.yaml b/services/comms/secretproviderclass.yaml index 69d4b2b..0a89552 100644 --- a/services/comms/secretproviderclass.yaml +++ b/services/comms/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "comms" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/comms" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" 
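      # The Secrets Store CSI driver mirrors the mounted Vault key into a regular
      # kubernetes.io/dockerconfigjson Secret (harbor-regcred) via the
      # secretObjects stanza below, so pods keep referencing the same pull secret
      # while the credential itself now lives at the shared/harbor-pull path.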
secretObjects: - secretName: harbor-regcred diff --git a/services/crypto/xmr-miner/secretproviderclass.yaml b/services/crypto/xmr-miner/secretproviderclass.yaml index a72097f..12e4ba1 100644 --- a/services/crypto/xmr-miner/secretproviderclass.yaml +++ b/services/crypto/xmr-miner/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "crypto" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/crypto" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/harbor/secretproviderclass.yaml b/services/harbor/secretproviderclass.yaml index 03fef95..636f6fa 100644 --- a/services/harbor/secretproviderclass.yaml +++ b/services/harbor/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "harbor" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/harbor" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/keycloak/secretproviderclass.yaml b/services/keycloak/secretproviderclass.yaml index 86cebd2..d4c094f 100644 --- a/services/keycloak/secretproviderclass.yaml +++ b/services/keycloak/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "sso" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/sso" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/logging/secretproviderclass.yaml b/services/logging/secretproviderclass.yaml index f5db15e..6ff642d 100644 --- a/services/logging/secretproviderclass.yaml +++ b/services/logging/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "logging" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/logging" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/mailu/secretproviderclass.yaml b/services/mailu/secretproviderclass.yaml index f58c69b..f9e281e 100644 --- a/services/mailu/secretproviderclass.yaml +++ b/services/mailu/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "mailu-mailserver" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/mailu-mailserver" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index fd2fb79..ee4884d 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -49,7 +49,7 @@ spec: node-role.kubernetes.io/worker: "true" containers: - name: ariadne - image: registry.bstein.dev/bstein/ariadne:0.1.0 + image: registry.bstein.dev/bstein/ariadne:0.1.0-0 imagePullPolicy: Always command: ["/bin/sh", "-c"] args: diff --git a/services/maintenance/image.yaml b/services/maintenance/image.yaml new file mode 100644 index 0000000..95acbd0 --- /dev/null +++ b/services/maintenance/image.yaml @@ -0,0 +1,21 @@ +# services/maintenance/image.yaml +apiVersion: image.toolkit.fluxcd.io/v1beta2 +kind: ImageRepository +metadata: + name: ariadne + namespace: maintenance +spec: + image: registry.bstein.dev/bstein/ariadne + interval: 1m0s +--- +apiVersion: image.toolkit.fluxcd.io/v1beta2 +kind: ImagePolicy +metadata: 
+ name: ariadne + namespace: maintenance +spec: + imageRepositoryRef: + name: ariadne + policy: + semver: + range: ">=0.1.0-0" diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index f0f3de5..5e199a9 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - namespace.yaml + - image.yaml - secretproviderclass.yaml - vault-serviceaccount.yaml - vault-sync-deployment.yaml @@ -22,6 +23,10 @@ resources: - node-image-sweeper-daemonset.yaml - image-sweeper-cronjob.yaml +images: + - name: registry.bstein.dev/bstein/ariadne + newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:ariadne"} + configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance diff --git a/services/maintenance/secretproviderclass.yaml b/services/maintenance/secretproviderclass.yaml index dd95948..85df2af 100644 --- a/services/maintenance/secretproviderclass.yaml +++ b/services/maintenance/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "maintenance" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/maintenance" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/monitoring/secretproviderclass.yaml b/services/monitoring/secretproviderclass.yaml index 8a6c5fb..350d6aa 100644 --- a/services/monitoring/secretproviderclass.yaml +++ b/services/monitoring/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "monitoring" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/monitoring" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/pegasus/secretproviderclass.yaml b/services/pegasus/secretproviderclass.yaml index b4621a5..b8d1df9 100644 --- a/services/pegasus/secretproviderclass.yaml +++ b/services/pegasus/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "pegasus" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/jellyfin" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index ca94ac6..c7eaf85 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -203,42 +203,42 @@ write_policy_and_role "outline" "outline" "outline-vault" \ write_policy_and_role "planka" "planka" "planka-vault" \ "planka/* shared/postmark-relay" "" write_policy_and_role "bstein-dev-home" "bstein-dev-home" "bstein-dev-home,bstein-dev-home-vault-sync" \ - "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret harbor-pull/bstein-dev-home" "" + "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "gitea" "gitea" "gitea-vault" \ "gitea/*" "" write_policy_and_role "vaultwarden" "vaultwarden" "vaultwarden-vault" \ "vaultwarden/* mailu/mailu-initial-account-secret" "" write_policy_and_role "sso" "sso" "sso-vault,sso-vault-sync,mas-secrets-ensure" \ - "sso/* portal/bstein-dev-home-keycloak-admin 
shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay harbor-pull/sso" "" + "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "mailu-mailserver" "mailu-mailserver" "mailu-vault-sync" \ - "mailu/* shared/postmark-relay harbor-pull/mailu-mailserver" "" + "mailu/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "harbor" "harbor" "harbor-vault-sync" \ - "harbor/* harbor-pull/harbor" "" + "harbor/* shared/harbor-pull" "" write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \ "nextcloud/* shared/keycloak-admin shared/postmark-relay" "" write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ - "comms/* shared/chat-ai-keys-runtime harbor-pull/comms" "" + "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" write_policy_and_role "jenkins" "jenkins" "jenkins" \ "jenkins/*" "" write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ - "monitoring/* shared/postmark-relay harbor-pull/monitoring" "" + "monitoring/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "logging" "logging" "logging-vault-sync" \ - "logging/* harbor-pull/logging" "" + "logging/* shared/harbor-pull" "" write_policy_and_role "pegasus" "jellyfin" "pegasus-vault-sync" \ - "pegasus/* harbor-pull/jellyfin" "" + "pegasus/* shared/harbor-pull" "" write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ - "crypto/* harbor-pull/crypto" "" + "crypto/* shared/harbor-pull" "" write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne" \ - "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret harbor-pull/maintenance" "" + "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ "" \ "finance/*" write_policy_and_role "longhorn" "longhorn-system" "longhorn-vault,longhorn-vault-sync" \ - "longhorn/* harbor-pull/longhorn" "" + "longhorn/* shared/harbor-pull" "" write_policy_and_role "postgres" "postgres" "postgres-vault" \ "postgres/postgres-db" "" write_policy_and_role "vault" "vault" "vault" \ From 61619ddf77e20f5abcc5a9d673d4d957848d7e08 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 19:07:00 -0300 Subject: [PATCH 003/416] fix: allow maintenance vault sync role --- services/vault/scripts/vault_k8s_auth_configure.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index c7eaf85..a5ccb61 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -230,7 +230,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ "crypto/* shared/harbor-pull" "" write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" -write_policy_and_role "maintenance" "maintenance" "ariadne" \ +write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "finance" 
"finance" "finance-vault" \ "finance/* shared/postmark-relay" "" From a0c3b9f9530c3abd31662bb806424fe1f6dab767 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 19:22:53 -0300 Subject: [PATCH 004/416] feat: wire portal to ariadne --- services/bstein-dev-home/backend-deployment.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 376622c..f3bca95 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -91,6 +91,10 @@ spec: value: atlas - name: KEYCLOAK_ADMIN_CLIENT_ID value: bstein-dev-home-admin + - name: ARIADNE_URL + value: http://ariadne.maintenance.svc.cluster.local:8080 + - name: ARIADNE_TIMEOUT_SEC + value: "10" - name: ACCOUNT_ALLOWED_GROUPS value: "" - name: HTTP_CHECK_TIMEOUT_SEC From fbdf53a9a81d074787a1da9e6d5c5c8c7a1b2ed8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:03:50 -0300 Subject: [PATCH 005/416] chore: add maintenance image automation --- .../bstein-dev-home/image-automation.yaml | 4 +-- .../flux-system/platform/kustomization.yaml | 1 + .../maintenance/image-automation.yaml | 26 +++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 clusters/atlas/flux-system/platform/maintenance/image-automation.yaml diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 88dda40..643d479 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -13,14 +13,14 @@ spec: git: checkout: ref: - branch: feature/vault-consumption + branch: feature/ariadne commit: author: email: ops@bstein.dev name: flux-bot messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}" push: - branch: feature/vault-consumption + branch: feature/ariadne update: strategy: Setters path: services/bstein-dev-home diff --git a/clusters/atlas/flux-system/platform/kustomization.yaml b/clusters/atlas/flux-system/platform/kustomization.yaml index b689cc0..6e75b04 100644 --- a/clusters/atlas/flux-system/platform/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/kustomization.yaml @@ -11,6 +11,7 @@ resources: - monitoring/kustomization.yaml - logging/kustomization.yaml - maintenance/kustomization.yaml + - maintenance/image-automation.yaml - longhorn-adopt/kustomization.yaml - longhorn/kustomization.yaml - longhorn-ui/kustomization.yaml diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml new file mode 100644 index 0000000..867cae4 --- /dev/null +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -0,0 +1,26 @@ +# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImageUpdateAutomation +metadata: + name: maintenance + namespace: flux-system +spec: + interval: 1m0s + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + git: + checkout: + ref: + branch: feature/ariadne + commit: + author: + email: ops@bstein.dev + name: flux-bot + messageTemplate: "chore(maintenance): update images to {{range .Updated.Images}}{{.}}{{end}}" + push: + branch: feature/ariadne + update: + strategy: 
Setters + path: services/maintenance From 776aea25f53d6878a13c7c42a97d33164075aeba Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:11:38 -0300 Subject: [PATCH 006/416] bstein-dev-home: bump images to 0.1.1-107 --- services/bstein-dev-home/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f9d3c87..ec137dc 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,9 +20,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-102 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} + newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-103 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} + newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 7171e5a9ea42d28e1f2e3f78707b9ecc5b2a4a09 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:33:34 -0300 Subject: [PATCH 007/416] fix: unblock keycloak and refresh glue checks --- ci/tests/glue/config.yaml | 9 +++++++++ ci/tests/glue/test_glue_metrics.py | 19 +++++++++++++++++++ services/jenkins/configmap-jcasc.yaml | 2 +- services/keycloak/realm-settings-job.yaml | 2 +- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/ci/tests/glue/config.yaml b/ci/tests/glue/config.yaml index 8adf4ca..16b656c 100644 --- a/ci/tests/glue/config.yaml +++ b/ci/tests/glue/config.yaml @@ -1,7 +1,16 @@ max_success_age_hours: 48 allow_suspended: + - bstein-dev-home/vaultwarden-cred-sync - comms/othrys-room-reset - comms/pin-othrys-invite - comms/seed-othrys-room - finance/firefly-user-sync + - health/wger-admin-ensure - health/wger-user-sync + - mailu-mailserver/mailu-sync-nightly + - nextcloud/nextcloud-mail-sync +ariadne_schedule_tasks: + - schedule.mailu_sync + - schedule.nextcloud_sync + - schedule.vaultwarden_sync + - schedule.wger_admin diff --git a/ci/tests/glue/test_glue_metrics.py b/ci/tests/glue/test_glue_metrics.py index 16b01c7..52ec0be 100644 --- a/ci/tests/glue/test_glue_metrics.py +++ b/ci/tests/glue/test_glue_metrics.py @@ -1,11 +1,19 @@ from __future__ import annotations import os +from pathlib import Path import requests +import yaml VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/") +CONFIG_PATH = Path(__file__).with_name("config.yaml") + + +def _load_config() -> dict: + with CONFIG_PATH.open("r", encoding="utf-8") as handle: + return yaml.safe_load(handle) or {} def _query(promql: str) -> list[dict]: @@ -27,3 +35,14 @@ def test_glue_metrics_success_join(): ) series = _query(query) assert series, "No glue cronjob last success series found" + + +def test_ariadne_schedule_metrics_present(): + cfg = _load_config() + expected = cfg.get("ariadne_schedule_tasks", []) + if not expected: + return + series = _query("ariadne_schedule_next_run_timestamp_seconds") + tasks = {item.get("metric", {}).get("task") for item in series} + missing = [task for task in expected if task not in tasks] + assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}" diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index ac26350..25dd748 100644 --- 
a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -151,7 +151,7 @@ data: url('https://scm.bstein.dev/bstein/titan-iac.git') credentials('gitea-pat') } - branches('*/feature/vault-consumption') + branches('*/main') } } scriptPath('ci/Jenkinsfile.titan-iac') diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index a0b36ec..fdee377 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-32 + name: keycloak-realm-settings-33 namespace: sso spec: backoffLimit: 0 From cff3ed075951395857bc4b64b660e27b1c51fc0b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:35:29 -0300 Subject: [PATCH 008/416] chore: run portal onboarding e2e job --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index f22272e..201e3f5 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-19 + name: portal-onboarding-e2e-test-20 namespace: bstein-dev-home spec: backoffLimit: 0 From c7e81674b0c6ce8d5fa09c98af0f8c3c0684e8db Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:38:22 -0300 Subject: [PATCH 009/416] fix: point portal at ariadne service --- services/bstein-dev-home/backend-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index f3bca95..074a19d 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -92,7 +92,7 @@ spec: - name: KEYCLOAK_ADMIN_CLIENT_ID value: bstein-dev-home-admin - name: ARIADNE_URL - value: http://ariadne.maintenance.svc.cluster.local:8080 + value: http://ariadne.maintenance.svc.cluster.local - name: ARIADNE_TIMEOUT_SEC value: "10" - name: ACCOUNT_ALLOWED_GROUPS From 67e422f56fed618fd3d31c1ba905f24d170d2912 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:42:14 -0300 Subject: [PATCH 010/416] chore: rerun portal onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 201e3f5..0b65090 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-20 + name: portal-onboarding-e2e-test-21 namespace: bstein-dev-home spec: backoffLimit: 0 From c1f0ea421d5dbfacb283e2f1dcd2fc0777e33497 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:49:23 -0300 Subject: [PATCH 011/416] fix: extend mailu mailbox wait for ariadne --- services/maintenance/ariadne-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index ee4884d..0543f80 100644 --- 
a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -93,7 +93,7 @@ spec: - name: MAILU_SYNC_URL value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC - value: "60" + value: "180" - name: MAILU_DB_HOST value: postgres-service.postgres.svc.cluster.local - name: MAILU_DB_PORT From 4b0d8fb301aac1cf009e06affef02cb350c01e95 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:04:59 -0300 Subject: [PATCH 012/416] chore(maintenance): bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 5e199a9..e09f6a8 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-1 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 901f3e797c85abfe68fa4bc57698e8b71eb7f4b0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:05:46 -0300 Subject: [PATCH 013/416] chore(portal): rerun onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 0b65090..c9c1c04 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-21 + name: portal-onboarding-e2e-test-22 namespace: bstein-dev-home spec: backoffLimit: 0 From 9ff88f7f132aa7b7ae208e461d785d7400f91395 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:28:07 -0300 Subject: [PATCH 014/416] fix(mailu): allow forced sync --- services/mailu/mailu-sync-listener.yaml | 2 +- services/mailu/scripts/mailu_sync_listener.py | 10 ++++++---- services/maintenance/kustomization.yaml | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/services/mailu/mailu-sync-listener.yaml b/services/mailu/mailu-sync-listener.yaml index cc98107..b3d2acc 100644 --- a/services/mailu/mailu-sync-listener.yaml +++ b/services/mailu/mailu-sync-listener.yaml @@ -30,7 +30,7 @@ spec: app: mailu-sync-listener annotations: vault.hashicorp.com/agent-inject: "true" - atlas.bstein.dev/mailu-sync-rev: "2" + atlas.bstein.dev/mailu-sync-rev: "3" vault.hashicorp.com/role: "mailu-mailserver" vault.hashicorp.com/agent-inject-secret-mailu-db-secret__database: "kv/data/atlas/mailu/mailu-db-secret" vault.hashicorp.com/agent-inject-template-mailu-db-secret__database: | diff --git a/services/mailu/scripts/mailu_sync_listener.py b/services/mailu/scripts/mailu_sync_listener.py index 6ac0da7..4e31c81 100644 --- a/services/mailu/scripts/mailu_sync_listener.py +++ b/services/mailu/scripts/mailu_sync_listener.py @@ -39,12 +39,12 @@ def _run_sync_blocking() -> int: sync_done.set() -def _trigger_sync_async() -> bool: +def _trigger_sync_async(force: bool = False) -> bool: with lock: now = time() if sync_running: return False - if now - last_run < MIN_INTERVAL_SECONDS: + if not force and now - last_run < MIN_INTERVAL_SECONDS: return False thread = 
threading.Thread(target=_run_sync_blocking, daemon=True) @@ -64,15 +64,17 @@ class Handler(http.server.BaseHTTPRequestHandler): return wait = False + force = False if isinstance(payload, dict): wait = bool(payload.get("wait")) + force = bool(payload.get("force")) if wait: with lock: already_running = sync_running if not already_running: - _trigger_sync_async() + _trigger_sync_async(force=force) sync_done.wait(timeout=WAIT_TIMEOUT_SECONDS) with lock: @@ -87,7 +89,7 @@ class Handler(http.server.BaseHTTPRequestHandler): self.end_headers() return - _trigger_sync_async() + _trigger_sync_async(force=force) self.send_response(202) self.end_headers() diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index e09f6a8..9255d88 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-1 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-2 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 84cd05b08a8b5c507d7dbe48e9668cb1da05eb8e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:31:45 -0300 Subject: [PATCH 015/416] chore(portal): rerun onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index c9c1c04..9dbe68d 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-22 + name: portal-onboarding-e2e-test-23 namespace: bstein-dev-home spec: backoffLimit: 0 From 34c42cfb6278a0ad6f07128b1f882b55e7a1e8a5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:45:31 -0300 Subject: [PATCH 016/416] core: fix postmark DNS and time sync --- infrastructure/core/coredns-custom.yaml | 3 ++ infrastructure/core/kustomization.yaml | 1 + infrastructure/core/ntp-sync-daemonset.yaml | 50 +++++++++++++++++++ .../postmark-exporter-deployment.yaml | 4 +- 4 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 infrastructure/core/ntp-sync-daemonset.yaml diff --git a/infrastructure/core/coredns-custom.yaml b/infrastructure/core/coredns-custom.yaml index 8aeff14..6266a22 100644 --- a/infrastructure/core/coredns-custom.yaml +++ b/infrastructure/core/coredns-custom.yaml @@ -32,6 +32,9 @@ data: 192.168.22.9 notes.bstein.dev 192.168.22.9 office.bstein.dev 192.168.22.9 pegasus.bstein.dev + 3.136.224.193 pm-bounces.bstein.dev + 3.150.68.49 pm-bounces.bstein.dev + 18.189.137.81 pm-bounces.bstein.dev 192.168.22.9 registry.bstein.dev 192.168.22.9 scm.bstein.dev 192.168.22.9 secret.bstein.dev diff --git a/infrastructure/core/kustomization.yaml b/infrastructure/core/kustomization.yaml index 6286186..257e1f0 100644 --- a/infrastructure/core/kustomization.yaml +++ b/infrastructure/core/kustomization.yaml @@ -6,5 +6,6 @@ resources: - ../modules/profiles/atlas-ha - coredns-custom.yaml - coredns-deployment.yaml + - ntp-sync-daemonset.yaml - ../sources/cert-manager/letsencrypt.yaml - ../sources/cert-manager/letsencrypt-prod.yaml diff --git a/infrastructure/core/ntp-sync-daemonset.yaml b/infrastructure/core/ntp-sync-daemonset.yaml new file mode 100644 index 0000000..ba97294 --- 
/dev/null +++ b/infrastructure/core/ntp-sync-daemonset.yaml @@ -0,0 +1,50 @@ +# infrastructure/core/ntp-sync-daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: ntp-sync + namespace: kube-system + labels: + app: ntp-sync +spec: + selector: + matchLabels: + app: ntp-sync + template: + metadata: + labels: + app: ntp-sync + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist + - key: node-role.kubernetes.io/master + operator: DoesNotExist + containers: + - name: ntp-sync + image: public.ecr.aws/docker/library/busybox:1.36.1 + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c"] + args: + - | + set -eu + while true; do + ntpd -q -p pool.ntp.org || true + sleep 300 + done + securityContext: + capabilities: + add: ["SYS_TIME"] + runAsUser: 0 + runAsGroup: 0 + resources: + requests: + cpu: 10m + memory: 16Mi + limits: + cpu: 50m + memory: 64Mi diff --git a/services/monitoring/postmark-exporter-deployment.yaml b/services/monitoring/postmark-exporter-deployment.yaml index 6406224..98791d9 100644 --- a/services/monitoring/postmark-exporter-deployment.yaml +++ b/services/monitoring/postmark-exporter-deployment.yaml @@ -18,9 +18,9 @@ spec: prometheus.io/path: "/metrics" vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "monitoring" - vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/monitoring/postmark-exporter" + vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/shared/postmark-relay" vault.hashicorp.com/agent-inject-template-postmark-env: | - {{- with secret "kv/data/atlas/monitoring/postmark-exporter" -}} + {{- with secret "kv/data/atlas/shared/postmark-relay" -}} export POSTMARK_SERVER_TOKEN="{{ index .Data.data "apikey" }}" export POSTMARK_SERVER_TOKEN_FALLBACK="{{ index .Data.data "apikey" }}" {{- if index .Data.data "sending-limit" }} From 1b2243e2a8045abc568bb194edddabb114e8454f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:45:48 -0300 Subject: [PATCH 017/416] chore(maintenance): bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 9255d88..35af46f 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-2 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-3 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From cffe53edbe21b212466b7ee7b81947faa903ca0b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:47:24 -0300 Subject: [PATCH 018/416] chore(portal): rerun onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 9dbe68d..535b1dc 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-23 + name: portal-onboarding-e2e-test-24 namespace: bstein-dev-home spec: backoffLimit: 0 From 
7d999cc6c6db2b27fcf0e99e40d8216c992119cb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:51:55 -0300 Subject: [PATCH 019/416] fix(mailu): pin sync workloads to arm64 --- services/mailu/mailu-sync-cronjob.yaml | 3 +++ services/mailu/mailu-sync-listener.yaml | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/services/mailu/mailu-sync-cronjob.yaml b/services/mailu/mailu-sync-cronjob.yaml index 671439d..bbe9909 100644 --- a/services/mailu/mailu-sync-cronjob.yaml +++ b/services/mailu/mailu-sync-cronjob.yaml @@ -38,6 +38,9 @@ spec: {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}} spec: restartPolicy: OnFailure + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" serviceAccountName: mailu-vault-sync containers: - name: mailu-sync diff --git a/services/mailu/mailu-sync-listener.yaml b/services/mailu/mailu-sync-listener.yaml index b3d2acc..0644c5b 100644 --- a/services/mailu/mailu-sync-listener.yaml +++ b/services/mailu/mailu-sync-listener.yaml @@ -30,7 +30,7 @@ spec: app: mailu-sync-listener annotations: vault.hashicorp.com/agent-inject: "true" - atlas.bstein.dev/mailu-sync-rev: "3" + atlas.bstein.dev/mailu-sync-rev: "4" vault.hashicorp.com/role: "mailu-mailserver" vault.hashicorp.com/agent-inject-secret-mailu-db-secret__database: "kv/data/atlas/mailu/mailu-db-secret" vault.hashicorp.com/agent-inject-template-mailu-db-secret__database: | @@ -52,6 +52,9 @@ spec: {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}} spec: restartPolicy: Always + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" serviceAccountName: mailu-vault-sync containers: - name: listener From 278b4541a25c9e84b088e2806505d45bfec5b848 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:58:37 -0300 Subject: [PATCH 020/416] chore(portal): rerun onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 535b1dc..505e181 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-24 + name: portal-onboarding-e2e-test-25 namespace: bstein-dev-home spec: backoffLimit: 0 From 8be01698a989cebd33d6fbe32d64412b3abe7b24 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 00:07:45 -0300 Subject: [PATCH 021/416] chore(maintenance): bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 35af46f..80c61df 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-3 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-4 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From f527da9cdb5ca0b8388838b13c3cc705c05d889d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 00:09:49 -0300 Subject: [PATCH 022/416] chore(portal): rerun onboarding e2e --- 
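Pinning the Mailu sync workloads with a `kubernetes.io/arch: arm64` nodeSelector is easy to regress silently if a label changes. A hypothetical spot check with the kubernetes Python client; the `app=mailu-sync-listener` label selector is assumed from the listener's pod template:

    from kubernetes import client, config

    def check_arch(namespace="mailu", selector="app=mailu-sync-listener"):
        config.load_kube_config()
        v1 = client.CoreV1Api()
        pods = v1.list_namespaced_pod(namespace, label_selector=selector).items
        for pod in pods:
            node = v1.read_node(pod.spec.node_name)
            arch = node.metadata.labels.get("kubernetes.io/arch")
            print(f"{pod.metadata.name} -> {pod.spec.node_name} ({arch})")
            assert arch == "arm64", f"{pod.metadata.name} scheduled on {arch}"

    if __name__ == "__main__":
        check_arch()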
services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 505e181..a0b6569 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-25 + name: portal-onboarding-e2e-test-26 namespace: bstein-dev-home spec: backoffLimit: 0 From a86d68ca748d6270d2e5d0e6ad0fd9d735af7ced Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 00:58:04 -0300 Subject: [PATCH 023/416] mailu: use postmark server token for relay --- services/mailu/helmrelease.yaml | 60 ++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 7342141..9779aed 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -335,8 +335,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -397,8 +403,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -459,8 +471,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -521,8 +539,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -583,8 +607,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end 
}} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -645,8 +675,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync From a8be46b422fb42e9696d4147928e54f8cb97df6b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:04:04 -0300 Subject: [PATCH 024/416] mailu: prefer postmark smtp token for relay --- services/mailu/helmrelease.yaml | 78 ++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 9779aed..4621a2d 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -335,13 +335,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -403,13 +404,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -471,13 +473,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export 
RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -539,13 +542,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -607,13 +611,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -675,13 +680,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: From 3e0260b9456d00e30dfe92c028da90ac41cecc42 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:05:06 -0300 Subject: [PATCH 025/416] ci: pin quality gate agents to rpi5 --- ci/Jenkinsfile.titan-iac | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/Jenkinsfile.titan-iac b/ci/Jenkinsfile.titan-iac index 3b13eb0..359dc94 100644 --- a/ci/Jenkinsfile.titan-iac +++ b/ci/Jenkinsfile.titan-iac @@ -6,6 +6,10 @@ pipeline { apiVersion: v1 kind: Pod spec: + nodeSelector: + hardware: rpi5 + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" containers: - name: python image: python:3.12-slim From cac8a3cdded02b9bf916c71bdea4c4a95ddbdff8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:07:01 -0300 Subject: [PATCH 026/416] mailu: recreate postfix on upgrade --- services/mailu/helmrelease.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/mailu/helmrelease.yaml 
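Patches 023 and 024 rewrite the same relay stanza in six vault-agent templates, first falling back from `apikey` to an access/secret pair and then inverting the precedence. The rule itself is small enough to pin down in a unit test; a pure-Python mirror of the final precedence, with key names as in kv/data/atlas/shared/postmark-relay:

    def relay_credentials(secret: dict) -> tuple[str, str]:
        access = secret.get("accesskey")
        secret_key = secret.get("secretkey")
        if access and secret_key:
            return access, secret_key  # prefer the SMTP token pair
        apikey = secret["apikey"]
        return apikey, apikey          # server token used on both sides

    assert relay_credentials({"apikey": "pm-token"}) == ("pm-token", "pm-token")
    assert relay_credentials(
        {"apikey": "pm-token", "accesskey": "u", "secretkey": "p"}
    ) == ("u", "p")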
b/services/mailu/helmrelease.yaml index 4621a2d..e84b376 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -455,6 +455,8 @@ spec: metadata: name: mailu-postfix spec: + strategy: + type: Recreate template: metadata: annotations: From cfcda87f678f33e45dcd8847a304ee1a64cd4a97 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:08:51 -0300 Subject: [PATCH 027/416] jenkins: re-target quality gate and restart --- services/jenkins/configmap-jcasc.yaml | 2 +- services/jenkins/deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 25dd748..0a25aa1 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -75,7 +75,7 @@ data: url('https://scm.bstein.dev/bstein/titan-iac.git') credentials('gitea-pat') } - branches('*/main') + branches('*/feature/ariadne') } } scriptPath('services/jellyfin/oidc/Jenkinsfile') diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index e846a8e..0e99cab 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -38,7 +38,7 @@ spec: GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} {{- end -}} - bstein.dev/restarted-at: "2026-01-19T00:25:00Z" + bstein.dev/restarted-at: "2026-01-20T04:08:33Z" spec: serviceAccountName: jenkins nodeSelector: From 14864a3b8c785368b1fc044657102e46b9d97463 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:14:30 -0300 Subject: [PATCH 028/416] jenkins: align quality gate branch --- services/jenkins/configmap-jcasc.yaml | 4 ++-- services/jenkins/deployment.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 0a25aa1..fcd01f9 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -75,7 +75,7 @@ data: url('https://scm.bstein.dev/bstein/titan-iac.git') credentials('gitea-pat') } - branches('*/feature/ariadne') + branches('*/main') } } scriptPath('services/jellyfin/oidc/Jenkinsfile') @@ -151,7 +151,7 @@ data: url('https://scm.bstein.dev/bstein/titan-iac.git') credentials('gitea-pat') } - branches('*/main') + branches('*/feature/ariadne') } } scriptPath('ci/Jenkinsfile.titan-iac') diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 0e99cab..dfbe5fe 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -38,7 +38,7 @@ spec: GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} {{- end -}} - bstein.dev/restarted-at: "2026-01-20T04:08:33Z" + bstein.dev/restarted-at: "2026-01-20T04:14:13Z" spec: serviceAccountName: jenkins nodeSelector: From 34fb371270222069354d250dbbe55105257e6a7c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:20:16 -0300 Subject: [PATCH 029/416] portal: rerun onboarding e2e job --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index a0b6569..681e89d 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: 
portal-onboarding-e2e-test-26 + name: portal-onboarding-e2e-test-27 namespace: bstein-dev-home spec: backoffLimit: 0 From 2bbbf019ff030d172f698df0845c5bd5cf08f6f1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 02:30:44 -0300 Subject: [PATCH 030/416] mailu: rewrite double-bounce to base domain --- services/mailu/helmrelease.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index e84b376..7eab19a 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -219,6 +219,8 @@ spec: overrides: postfix.cf: | mynetworks = 127.0.0.0/8 [::1]/128 10.42.0.0/16 10.43.0.0/16 192.168.22.0/24 + recipient_canonical_maps = regexp:/overrides/recipient_canonical, ${podop}recipientmap + recipient_canonical_classes = envelope_recipient,header_recipient smtpd_delay_reject = yes smtpd_helo_required = yes smtpd_helo_restrictions = reject_invalid_helo_hostname, reject_non_fqdn_helo_hostname, reject_unknown_helo_hostname @@ -238,6 +240,8 @@ spec: smtpd_client_message_rate_limit = 100 smtpd_client_recipient_rate_limit = 200 smtpd_recipient_limit = 100 + recipient_canonical: | + /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev podAnnotations: bstein.dev/restarted-at: "2026-01-06T00:00:00Z" redis: From d996bda2c186847fefd668b4e84987c583d1c72d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 02:32:43 -0300 Subject: [PATCH 031/416] mailu: restart postfix to load canonical map --- services/mailu/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 7eab19a..599faf1 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -243,7 +243,7 @@ spec: recipient_canonical: | /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev podAnnotations: - bstein.dev/restarted-at: "2026-01-06T00:00:00Z" + bstein.dev/restarted-at: "2026-01-20T04:20:00Z" redis: enabled: true architecture: standalone From 814d1ce211969bd485fbc9a33354a0270ca39f11 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 02:37:02 -0300 Subject: [PATCH 032/416] mailu: keep podop socketmap in canonical maps --- services/mailu/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 599faf1..9d8519b 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -219,7 +219,7 @@ spec: overrides: postfix.cf: | mynetworks = 127.0.0.0/8 [::1]/128 10.42.0.0/16 10.43.0.0/16 192.168.22.0/24 - recipient_canonical_maps = regexp:/overrides/recipient_canonical, ${podop}recipientmap + recipient_canonical_maps = regexp:/overrides/recipient_canonical, socketmap:unix:/tmp/podop.socket:recipientmap recipient_canonical_classes = envelope_recipient,header_recipient smtpd_delay_reject = yes smtpd_helo_required = yes From 95a7ac235f732cd918aa371184f87e1e60e7fdf8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 02:38:04 -0300 Subject: [PATCH 033/416] mailu: restart postfix after canonical map update --- services/mailu/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 9d8519b..2a7e6f5 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -243,7 +243,7 @@ spec: recipient_canonical: | /^double-bounce@mail\.bstein\.dev$/ 
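The `recipient_canonical` override is a single regexp rule, and its intent can be checked outside Postfix. A tiny illustration of the rewrite (this mirrors only this one rule, not full Postfix regexp-table semantics):

    import re

    RULE = re.compile(r"^double-bounce@mail\.bstein\.dev$")

    def canonicalize(addr: str) -> str:
        return "double-bounce@bstein.dev" if RULE.match(addr) else addr

    assert canonicalize("double-bounce@mail.bstein.dev") == "double-bounce@bstein.dev"
    assert canonicalize("user@mail.bstein.dev") == "user@mail.bstein.dev"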
double-bounce@bstein.dev podAnnotations: - bstein.dev/restarted-at: "2026-01-20T04:20:00Z" + bstein.dev/restarted-at: "2026-01-20T04:35:00Z" redis: enabled: true architecture: standalone From e2e7e58f32c96db180733c83cd899a61d802beed Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 03:01:59 -0300 Subject: [PATCH 034/416] maintenance: extend Ariadne schedules and RBAC --- services/comms/guest-name-job.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 34 ++++++++++++++++++++ services/maintenance/ariadne-rbac.yaml | 29 +++++++++++++++++ services/maintenance/kustomization.yaml | 1 + services/vault/k8s-auth-config-cronjob.yaml | 1 + services/vault/oidc-config-cronjob.yaml | 1 + 6 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 services/maintenance/ariadne-rbac.yaml diff --git a/services/comms/guest-name-job.yaml b/services/comms/guest-name-job.yaml index 21a8af5..3eae2dd 100644 --- a/services/comms/guest-name-job.yaml +++ b/services/comms/guest-name-job.yaml @@ -8,7 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/1 * * * *" - suspend: false + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 1 diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 0543f80..cd0d38c 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -78,6 +78,8 @@ spec: value: bstein-dev-home-admin - name: PORTAL_PUBLIC_BASE_URL value: https://bstein.dev + - name: ARIADNE_LOG_LEVEL + value: INFO - name: PORTAL_ADMIN_USERS value: bstein - name: PORTAL_ADMIN_GROUPS @@ -120,6 +122,26 @@ spec: value: firefly-user-sync - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC value: "90" + - name: VAULT_NAMESPACE + value: vault + - name: VAULT_K8S_AUTH_CRONJOB + value: vault-k8s-auth-config + - name: VAULT_OIDC_CRONJOB + value: vault-oidc-config + - name: VAULT_JOB_WAIT_TIMEOUT_SEC + value: "120" + - name: COMMS_NAMESPACE + value: comms + - name: COMMS_GUEST_NAME_CRONJOB + value: guest-name-randomizer + - name: COMMS_PIN_INVITE_CRONJOB + value: pin-othrys-invite + - name: COMMS_RESET_ROOM_CRONJOB + value: othrys-room-reset + - name: COMMS_SEED_ROOM_CRONJOB + value: seed-othrys-room + - name: COMMS_JOB_WAIT_TIMEOUT_SEC + value: "60" - name: VAULTWARDEN_NAMESPACE value: vaultwarden - name: VAULTWARDEN_POD_LABEL @@ -154,6 +176,18 @@ spec: value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN value: "15 3 * * *" + - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH + value: "*/15 * * * *" + - name: ARIADNE_SCHEDULE_VAULT_OIDC + value: "*/15 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME + value: "*/1 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE + value: "*/30 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM + value: "0 0 1 1 *" + - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM + value: "*/10 * * * *" - name: WELCOME_EMAIL_ENABLED value: "true" - name: K8S_API_TIMEOUT_SEC diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml new file mode 100644 index 0000000..8d2a2a9 --- /dev/null +++ b/services/maintenance/ariadne-rbac.yaml @@ -0,0 +1,29 @@ +# services/maintenance/ariadne-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: ariadne-job-spawner +rules: + - apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: + - get + - list + - watch + - create + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: 
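With the glue CronJobs suspended, the cron expressions now live only in Ariadne's environment, so a typo would surface as a silently dead schedule rather than a CronJob error. A sketch for validating them ahead of rollout, assuming the third-party croniter package; the dict samples three of the schedules set above:

    from datetime import datetime, timezone
    from croniter import croniter

    SCHEDULES = {
        "ARIADNE_SCHEDULE_VAULT_K8S_AUTH": "*/15 * * * *",
        "ARIADNE_SCHEDULE_COMMS_GUEST_NAME": "*/1 * * * *",
        "ARIADNE_SCHEDULE_COMMS_RESET_ROOM": "0 0 1 1 *",
    }

    now = datetime.now(timezone.utc)
    for name, spec in SCHEDULES.items():
        if not croniter.is_valid(spec):
            raise ValueError(f"{name}: invalid cron spec {spec!r}")
        nxt = croniter(spec, now).get_next(datetime)
        print(f"{name}: next run {nxt.isoformat()}")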
ariadne-job-spawner +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: ariadne-job-spawner diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 80c61df..0810f5e 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -8,6 +8,7 @@ resources: - vault-serviceaccount.yaml - vault-sync-deployment.yaml - ariadne-serviceaccount.yaml + - ariadne-rbac.yaml - disable-k3s-traefik-serviceaccount.yaml - k3s-traefik-cleanup-rbac.yaml - node-nofile-serviceaccount.yaml diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml index 29e8e80..e7cca14 100644 --- a/services/vault/k8s-auth-config-cronjob.yaml +++ b/services/vault/k8s-auth-config-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/vault/oidc-config-cronjob.yaml b/services/vault/oidc-config-cronjob.yaml index 013c9f3..4d317c5 100644 --- a/services/vault/oidc-config-cronjob.yaml +++ b/services/vault/oidc-config-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 From 5fe8866623c02326e805e5f3c59cdeff0ebf242e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 03:21:36 -0300 Subject: [PATCH 035/416] ci(jenkins): add multibranch quality gate --- ci/Jenkinsfile.titan-iac | 23 +++++++++++++++-- services/jenkins/configmap-jcasc.yaml | 33 ++++++++++++++++--------- services/jenkins/configmap-plugins.yaml | 1 + services/maintenance/kustomization.yaml | 2 +- 4 files changed, 44 insertions(+), 15 deletions(-) diff --git a/ci/Jenkinsfile.titan-iac b/ci/Jenkinsfile.titan-iac index 359dc94..77990d7 100644 --- a/ci/Jenkinsfile.titan-iac +++ b/ci/Jenkinsfile.titan-iac @@ -22,7 +22,6 @@ spec: environment { PIP_DISABLE_PIP_VERSION_CHECK = '1' PYTHONUNBUFFERED = '1' - DEPLOY_BRANCH = 'deploy' } stages { stage('Checkout') { @@ -40,7 +39,27 @@ sh 'pytest -q ci/tests/glue' } } + stage('Resolve Flux branch') { + steps { + script { + env.FLUX_BRANCH = sh( + returnStdout: true, + script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml" + ).trim() + if (!env.FLUX_BRANCH) { + error('Flux branch not found in gotk-sync.yaml') + } + echo "Flux branch: ${env.FLUX_BRANCH}" + } + } + } stage('Promote') { + when { + expression { + def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '') + return env.FLUX_BRANCH && branch == env.FLUX_BRANCH + } + } steps { withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) { sh ''' @@ -48,7 +67,7 @@ git config user.email "jenkins@bstein.dev" git config user.name "jenkins" git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git - git push origin HEAD:${DEPLOY_BRANCH} + git push origin HEAD:${FLUX_BRANCH} ''' } } diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index fcd01f9..62012f1 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -139,24 +139,33 @@ data: } } } - pipelineJob('titan-iac-quality-gate') { - triggers { - scm('H/5 * 
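In code, the ariadne-job-spawner ClusterRole amounts to: read a (now suspended) CronJob and create a one-off Job from its template, the same trick as `kubectl create job --from=cronjob/...`. A sketch with the kubernetes Python client; the CronJob name and namespace are examples:

    import time
    from kubernetes import client, config

    def spawn_from_cronjob(name="vault-k8s-auth-config", namespace="vault"):
        config.load_incluster_config()  # running as the ariadne ServiceAccount
        batch = client.BatchV1Api()
        cron = batch.read_namespaced_cron_job(name, namespace)
        job = client.V1Job(
            metadata=client.V1ObjectMeta(
                name=f"{name}-manual-{int(time.time())}",
                namespace=namespace,
            ),
            # Reuse the CronJob's pod template verbatim, vault annotations included.
            spec=cron.spec.job_template.spec,
        )
        return batch.create_namespaced_job(namespace, job)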
* * *') - } - definition { - cpsScm { - scm { + multibranchPipelineJob('titan-iac-quality-gate') { + branchSources { + branchSource { + source { git { - remote { - url('https://scm.bstein.dev/bstein/titan-iac.git') - credentials('gitea-pat') - } - branches('*/feature/ariadne') + id('titan-iac-quality-gate') + remote('https://scm.bstein.dev/bstein/titan-iac.git') + credentialsId('gitea-pat') } } + } + } + factory { + workflowBranchProjectFactory { scriptPath('ci/Jenkinsfile.titan-iac') } } + orphanedItemStrategy { + discardOldItems { + numToKeep(30) + } + } + triggers { + periodicFolderTrigger { + interval('12h') + } + } } base.yaml: | jenkins: diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index eabea13..108c646 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -9,6 +9,7 @@ data: kubernetes workflow-aggregator git + git-branch-source pipeline-utility-steps configuration-as-code configuration-as-code-support diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 0810f5e..b7fe46b 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-4 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-5 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 557663f52474507a1f4ce3b8e22aa200a3af81d9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 03:30:48 -0300 Subject: [PATCH 036/416] ci(jenkins): add Ariadne pipeline job --- services/jenkins/configmap-jcasc.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 62012f1..78d98fe 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -120,6 +120,25 @@ data: } } } + pipelineJob('ariadne') { + triggers { + scm('H/2 * * * *') + } + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/ariadne.git') + credentials('gitea-pat') + } + branches('*/master') + } + } + scriptPath('Jenkinsfile') + } + } + } pipelineJob('data-prepper') { triggers { scm('H/5 * * * *') From 56b36330b22c81a713047013b867f33455e1ce89 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 03:59:19 -0300 Subject: [PATCH 037/416] glue: preserve keycloak profile updates --- services/mailu/scripts/mailu_sync.py | 32 ++++++++++++++++++++++++- services/maintenance/kustomization.yaml | 2 +- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/services/mailu/scripts/mailu_sync.py b/services/mailu/scripts/mailu_sync.py index 001917a..71b0f5a 100644 --- a/services/mailu/scripts/mailu_sync.py +++ b/services/mailu/scripts/mailu_sync.py @@ -130,7 +130,9 @@ def kc_update_attributes(token, user, attributes): if not isinstance(current_attrs, dict): current_attrs = {} current_attrs.update(attributes) - resp = SESSION.put(user_url, headers=headers, json={"attributes": current_attrs}, timeout=20) + payload = _safe_update_payload(current_payload) + payload["attributes"] = current_attrs + resp = SESSION.put(user_url, headers=headers, json=payload, timeout=20) resp.raise_for_status() verify = SESSION.get( user_url, @@ -144,6 +146,34 @@ def kc_update_attributes(token, user, attributes): raise Exception(f"attribute not persisted for {user.get('email') or 
user['username']}") +def _safe_update_payload(user_payload: dict) -> dict: + payload: dict = {} + username = user_payload.get("username") + if isinstance(username, str): + payload["username"] = username + enabled = user_payload.get("enabled") + if isinstance(enabled, bool): + payload["enabled"] = enabled + email = user_payload.get("email") + if isinstance(email, str): + payload["email"] = email + email_verified = user_payload.get("emailVerified") + if isinstance(email_verified, bool): + payload["emailVerified"] = email_verified + first_name = user_payload.get("firstName") + if isinstance(first_name, str): + payload["firstName"] = first_name + last_name = user_payload.get("lastName") + if isinstance(last_name, str): + payload["lastName"] = last_name + actions = user_payload.get("requiredActions") + if isinstance(actions, list): + payload["requiredActions"] = [a for a in actions if isinstance(a, str)] + attrs = user_payload.get("attributes") + payload["attributes"] = attrs if isinstance(attrs, dict) else {} + return payload + + def random_password(): alphabet = string.ascii_letters + string.digits return "".join(secrets.choice(alphabet) for _ in range(24)) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index b7fe46b..a86453e 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-5 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-6 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 132074f0ff335deb7dc815429792d47090524375 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 09:06:39 -0300 Subject: [PATCH 038/416] gitea: allow jenkins webhook --- services/gitea/deployment.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/gitea/deployment.yaml b/services/gitea/deployment.yaml index 9dc0c87..da188c3 100644 --- a/services/gitea/deployment.yaml +++ b/services/gitea/deployment.yaml @@ -169,6 +169,8 @@ spec: value: "trace" - name: GITEA__service__REQUIRE_SIGNIN_VIEW value: "false" + - name: GITEA__webhook__ALLOWED_HOST_LIST + value: "ci.bstein.dev" - name: GITEA__server__PROXY_HEADERS value: "X-Forwarded-For, X-Forwarded-Proto, X-Forwarded-Host" - name: GITEA__session__COOKIE_SECURE From b8f2d00547fd97b3c752bb61726c4c3ccdf9463b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 09:37:21 -0300 Subject: [PATCH 039/416] jenkins: pin root url for OIDC --- services/jenkins/configmap-jcasc.yaml | 5 ++++- services/jenkins/deployment.yaml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 78d98fe..d4a29f1 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -18,7 +18,7 @@ data: logoutFromOpenIdProvider: true postLogoutRedirectUrl: "https://ci.bstein.dev" sendScopesInTokenRequest: true - rootURLFromRequest: true + rootURLFromRequest: false userNameField: "preferred_username" fullNameFieldName: "name" emailFieldName: "email" @@ -245,3 +245,6 @@ data: crumbIssuer: standard: excludeClientIPFromCrumb: true + unclassified: + location: + url: "https://ci.bstein.dev/" diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index dfbe5fe..fdb8d10 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -38,7 +38,7 @@ spec: 
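The `_safe_update_payload` helper above exists because Keycloak's admin PUT /users/{id} replaces the stored representation: a body containing only `attributes` can blank `requiredActions` and other mutable fields. A self-contained sketch of the fetch-merge-put flow the patch implements (endpoint shape per the Keycloak admin REST API; the field whitelist mirrors the helper):

    import requests

    def update_attributes(base, realm, token, user_id, new_attrs):
        url = f"{base}/admin/realms/{realm}/users/{user_id}"
        headers = {"Authorization": f"Bearer {token}"}
        current = requests.get(url, headers=headers, timeout=20).json()
        # Echo back only known-safe mutable fields; drop server-managed keys.
        payload = {
            k: current[k]
            for k in ("username", "enabled", "email", "emailVerified",
                      "firstName", "lastName", "requiredActions")
            if k in current
        }
        attrs = current.get("attributes") or {}
        attrs.update(new_attrs)
        payload["attributes"] = attrs
        resp = requests.put(url, headers=headers, json=payload, timeout=20)
        resp.raise_for_status()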
GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} {{- end -}} - bstein.dev/restarted-at: "2026-01-20T04:14:13Z" + bstein.dev/restarted-at: "2026-01-20T05:05:00Z" spec: serviceAccountName: jenkins nodeSelector: From ea6e60000752f6d1a265f1c85f8001d61d425c0f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 09:45:33 -0300 Subject: [PATCH 040/416] jenkins: drop removed multibranch plugin --- services/jenkins/configmap-jcasc.yaml | 33 +++++++++---------------- services/jenkins/configmap-plugins.yaml | 2 -- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index d4a29f1..9e116c0 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -158,33 +158,24 @@ data: } } } - multibranchPipelineJob('titan-iac-quality-gate') { - branchSources { - branchSource { - source { + pipelineJob('titan-iac-quality-gate') { + triggers { + scm('H/12 * * * *') + } + definition { + cpsScm { + scm { git { - id('titan-iac-quality-gate') - remote('https://scm.bstein.dev/bstein/titan-iac.git') - credentialsId('gitea-pat') + remote { + url('https://scm.bstein.dev/bstein/titan-iac.git') + credentials('gitea-pat') + } + branches('*/main') } } - } - } - factory { - workflowBranchProjectFactory { scriptPath('ci/Jenkinsfile.titan-iac') } } - orphanedItemStrategy { - discardOldItems { - numToKeep(30) - } - } - triggers { - periodicFolderTrigger { - interval('12h') - } - } } base.yaml: | jenkins: diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index 108c646..d20a283 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -9,10 +9,8 @@ data: kubernetes workflow-aggregator git - git-branch-source pipeline-utility-steps configuration-as-code - configuration-as-code-support oic-auth job-dsl simple-theme-plugin From 8e9db51f9da4fdd8cb8701f4a098de1ba157cbde Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:15:33 -0300 Subject: [PATCH 041/416] jenkins: restore multibranch + webhook token --- services/jenkins/configmap-jcasc.yaml | 40 +++++++++++++++++-------- services/jenkins/configmap-plugins.yaml | 22 +++++++++----- services/jenkins/deployment.yaml | 17 ++++++----- 3 files changed, 52 insertions(+), 27 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 9e116c0..ca3a722 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -158,24 +158,40 @@ data: } } } - pipelineJob('titan-iac-quality-gate') { - triggers { - scm('H/12 * * * *') - } - definition { - cpsScm { - scm { + multibranchPipelineJob('titan-iac-quality-gate') { + branchSources { + branchSource { + source { git { - remote { - url('https://scm.bstein.dev/bstein/titan-iac.git') - credentials('gitea-pat') - } - branches('*/main') + id('titan-iac-quality-gate') + remote('https://scm.bstein.dev/bstein/titan-iac.git') + credentialsId('gitea-pat') } } + } + } + factory { + workflowBranchProjectFactory { scriptPath('ci/Jenkinsfile.titan-iac') } } + orphanedItemStrategy { + discardOldItems { + numToKeep(30) + } + } + triggers { + periodicFolderTrigger { + interval('12h') + } + } + configure { node -> + def token = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: '' + def triggers = node / 'triggers' + triggers << 'com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger' { + token(token) + } + } 
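The restored multibranch job registers a ComputedFolderWebHookTrigger token, which the multibranch-scan-webhook-trigger plugin exposes as an invoke endpoint, letting Gitea request an immediate re-scan instead of waiting for the 12h folder trigger. A hedged sketch of that call (endpoint path as documented by the plugin; the token value is illustrative):

    import requests

    def kick_scan(token: str, base="https://ci.bstein.dev"):
        resp = requests.post(
            f"{base}/multibranch-webhook-trigger/invoke",
            params={"token": token},
            timeout=10,
        )
        resp.raise_for_status()
        return resp.status_code

    # kick_scan("titan-iac-quality-gate-token")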
} base.yaml: | jenkins: diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index d20a283..3529512 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -6,11 +6,17 @@ metadata: namespace: jenkins data: plugins.txt: | - kubernetes - workflow-aggregator - git - pipeline-utility-steps - configuration-as-code - oic-auth - job-dsl - simple-theme-plugin + kubernetes:4416.v2ea_b_5372da_a_e + workflow-aggregator:608.v67378e9d3db_1 + git:5.8.1 + pipeline-utility-steps:2.20.0 + configuration-as-code:2031.veb_a_fdda_b_3ffd + oic-auth:4.626.ve5a_d9f26c051 + job-dsl:1.93 + simple-theme-plugin:230.v8b_fd91b_b_800c + workflow-multibranch:821.vc3b_4ea_780798 + branch-api:2.1268.v044a_87612da_8 + scm-api:724.v7d839074eb_5c + gitea:268.v75e47974c01d + gitea-checks:603.621.vc708da_fb_371d + multibranch-scan-webhook-trigger:1.0.11 diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index fdb8d10..c82a6af 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -22,23 +22,26 @@ spec: vault.hashicorp.com/role: "jenkins" vault.hashicorp.com/agent-inject-secret-jenkins-env: "kv/data/atlas/jenkins/jenkins-oidc" vault.hashicorp.com/agent-inject-template-jenkins-env: | - {{- with secret "kv/data/atlas/jenkins/jenkins-oidc" -}} + {{ with secret "kv/data/atlas/jenkins/jenkins-oidc" }} OIDC_CLIENT_ID={{ .Data.data.clientId }} OIDC_CLIENT_SECRET={{ .Data.data.clientSecret }} OIDC_AUTH_URL={{ .Data.data.authorizationUrl }} OIDC_TOKEN_URL={{ .Data.data.tokenUrl }} OIDC_USERINFO_URL={{ .Data.data.userInfoUrl }} OIDC_LOGOUT_URL={{ .Data.data.logoutUrl }} - {{- end }} - {{- with secret "kv/data/atlas/jenkins/harbor-robot-creds" -}} + {{ end }} + {{ with secret "kv/data/atlas/jenkins/harbor-robot-creds" }} HARBOR_ROBOT_USERNAME={{ .Data.data.username }} HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} - {{- end }} - {{- with secret "kv/data/atlas/jenkins/gitea-pat" -}} + {{ end }} + {{ with secret "kv/data/atlas/jenkins/gitea-pat" }} GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} - {{- end -}} - bstein.dev/restarted-at: "2026-01-20T05:05:00Z" + {{ end }} + {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} + TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} + {{ end }} + bstein.dev/restarted-at: "2026-01-20T13:10:00Z" spec: serviceAccountName: jenkins nodeSelector: From fe30570b62fdaeb271254bd8192afbeb27805cf2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:23:08 -0300 Subject: [PATCH 042/416] jenkins: pin oic-auth for core 2.528.3 --- services/jenkins/configmap-plugins.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index 3529512..1c43cfb 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -11,7 +11,7 @@ data: git:5.8.1 pipeline-utility-steps:2.20.0 configuration-as-code:2031.veb_a_fdda_b_3ffd - oic-auth:4.626.ve5a_d9f26c051 + oic-auth:4.609.v9de140f63d01 job-dsl:1.93 simple-theme-plugin:230.v8b_fd91b_b_800c workflow-multibranch:821.vc3b_4ea_780798 From b8d8240383235745fcaf6ed4b3e43494c7397200 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:31:30 -0300 Subject: [PATCH 043/416] jenkins: fix webhook trigger DSL --- services/jenkins/configmap-jcasc.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git 
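Moving plugins.txt from floating ids to pinned `name:version` entries only helps if it stays pinned. A small guard that could run in the quality gate (sketch; the path is illustrative, since the real list is embedded in the ConfigMap above):

    import pathlib
    import re

    PINNED = re.compile(r"^[A-Za-z0-9_-]+:[A-Za-z0-9._-]+$")

    def unpinned(path="services/jenkins/plugins.txt"):
        lines = pathlib.Path(path).read_text().splitlines()
        return [l for l in (s.strip() for s in lines) if l and not PINNED.match(l)]

    if __name__ == "__main__":
        bad = unpinned()
        if bad:
            raise SystemExit(f"unpinned plugins: {bad}")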
a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index ca3a722..7e6df31 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -186,11 +186,10 @@ data: } } configure { node -> - def token = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: '' + def webhookToken = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: '' def triggers = node / 'triggers' - triggers << 'com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger' { - token(token) - } + def webhook = triggers.appendNode('com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger') + webhook.appendNode('token', webhookToken) } } base.yaml: | From 36ae49f1fca427a1c2bebcae94d61f1584e9d77a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:37:57 -0300 Subject: [PATCH 044/416] jenkins: clean legacy quality-gate job --- services/jenkins/deployment.yaml | 2 +- services/jenkins/kustomization.yaml | 1 + services/jenkins/scripts/job_cleanup.groovy | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 services/jenkins/scripts/job_cleanup.groovy diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index c82a6af..c71812a 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T13:10:00Z" + bstein.dev/restarted-at: "2026-01-20T13:45:00Z" spec: serviceAccountName: jenkins nodeSelector: diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index acb6fb4..987e842 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -16,6 +16,7 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: + - job_cleanup.groovy=scripts/job_cleanup.groovy - theme.groovy=scripts/theme.groovy options: disableNameSuffixHash: true diff --git a/services/jenkins/scripts/job_cleanup.groovy b/services/jenkins/scripts/job_cleanup.groovy new file mode 100644 index 0000000..f123c6b --- /dev/null +++ b/services/jenkins/scripts/job_cleanup.groovy @@ -0,0 +1,13 @@ +import jenkins.branch.MultiBranchProject +import jenkins.model.Jenkins + +def jenkins = Jenkins.instance +if (jenkins == null) { + return +} + +def legacy = jenkins.getItemByFullName('titan-iac-quality-gate') +if (legacy != null && !(legacy instanceof MultiBranchProject)) { + legacy.delete() + println("Deleted legacy job titan-iac-quality-gate (non-multibranch)") +} From 9cdf244d98f1acfdc8feffd537a692230d88d0e0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:59:51 -0300 Subject: [PATCH 045/416] jenkins: drop legacy cleanup and update triggers --- services/jenkins/configmap-jcasc.yaml | 40 +++++++++++++++------ services/jenkins/deployment.yaml | 2 +- services/jenkins/kustomization.yaml | 1 - services/jenkins/scripts/job_cleanup.groovy | 13 ------- 4 files changed, 31 insertions(+), 25 deletions(-) delete mode 100644 services/jenkins/scripts/job_cleanup.groovy diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 7e6df31..ba0ac81 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -49,8 +49,12 @@ data: jobs: - script: | pipelineJob('harbor-arm-build') { - triggers { - scm('H/5 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/5 * * * *') 
+ } + } } definition { cpsScm { @@ -83,8 +87,12 @@ data: } } pipelineJob('ci-demo') { - triggers { - scm('H/1 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/1 * * * *') + } + } } definition { cpsScm { @@ -102,8 +110,12 @@ data: } } pipelineJob('bstein-dev-home') { - triggers { - scm('H/2 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/2 * * * *') + } + } } definition { cpsScm { @@ -121,8 +133,12 @@ data: } } pipelineJob('ariadne') { - triggers { - scm('H/2 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/2 * * * *') + } + } } definition { cpsScm { @@ -140,8 +156,12 @@ data: } } pipelineJob('data-prepper') { - triggers { - scm('H/5 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/5 * * * *') + } + } } definition { cpsScm { diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index c71812a..9e83686 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T13:45:00Z" + bstein.dev/restarted-at: "2026-01-20T14:05:00Z" spec: serviceAccountName: jenkins nodeSelector: diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index 987e842..acb6fb4 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -16,7 +16,6 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: - - job_cleanup.groovy=scripts/job_cleanup.groovy - theme.groovy=scripts/theme.groovy options: disableNameSuffixHash: true diff --git a/services/jenkins/scripts/job_cleanup.groovy b/services/jenkins/scripts/job_cleanup.groovy deleted file mode 100644 index f123c6b..0000000 --- a/services/jenkins/scripts/job_cleanup.groovy +++ /dev/null @@ -1,13 +0,0 @@ -import jenkins.branch.MultiBranchProject -import jenkins.model.Jenkins - -def jenkins = Jenkins.instance -if (jenkins == null) { - return -} - -def legacy = jenkins.getItemByFullName('titan-iac-quality-gate') -if (legacy != null && !(legacy instanceof MultiBranchProject)) { - legacy.delete() - println("Deleted legacy job titan-iac-quality-gate (non-multibranch)") -} From 0d3c5eb97606af4b03ae6a71a334d65a3e30195d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 11:07:54 -0300 Subject: [PATCH 046/416] jenkins: use pollSCM for pipeline triggers --- services/jenkins/configmap-jcasc.yaml | 10 +++++----- services/jenkins/deployment.yaml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index ba0ac81..71826ff 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -52,7 +52,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/5 * * * *') + pollSCM('H/5 * * * *') } } } @@ -90,7 +90,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/1 * * * *') + pollSCM('H/1 * * * *') } } } @@ -113,7 +113,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/2 * * * *') + pollSCM('H/2 * * * *') } } } @@ -136,7 +136,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/2 * * * *') + pollSCM('H/2 * * * *') } } } @@ -159,7 +159,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/5 * * * *') + pollSCM('H/5 * * * *') } } } diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml 
index 9e83686..cab3621 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T14:05:00Z" + bstein.dev/restarted-at: "2026-01-20T14:15:00Z" spec: serviceAccountName: jenkins nodeSelector: From 9f6824ad569cac0f39af68e209da9d1427d2e60f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 11:14:29 -0300 Subject: [PATCH 047/416] jenkins: use scmTrigger for pipeline polls --- services/jenkins/configmap-jcasc.yaml | 25 ++++++++++++++++++++----- services/jenkins/deployment.yaml | 2 +- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 71826ff..aa279e9 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -52,7 +52,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/5 * * * *') + scmTrigger { + spec('H/5 * * * *') + ignorePostCommitHooks(false) + } } } } @@ -90,7 +93,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/1 * * * *') + scmTrigger { + spec('H/1 * * * *') + ignorePostCommitHooks(false) + } } } } @@ -113,7 +119,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/2 * * * *') + scmTrigger { + spec('H/2 * * * *') + ignorePostCommitHooks(false) + } } } } @@ -136,7 +145,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/2 * * * *') + scmTrigger { + spec('H/2 * * * *') + ignorePostCommitHooks(false) + } } } } @@ -159,7 +171,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/5 * * * *') + scmTrigger { + spec('H/5 * * * *') + ignorePostCommitHooks(false) + } } } } diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index cab3621..7706807 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T14:15:00Z" + bstein.dev/restarted-at: "2026-01-20T14:25:00Z" spec: serviceAccountName: jenkins nodeSelector: From b54da8e3e0b2c08e57a498e0665a1d7bf4c9eb68 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 11:23:06 -0300 Subject: [PATCH 048/416] jenkins: fix scmTrigger spec field --- services/jenkins/configmap-jcasc.yaml | 10 +++++----- services/jenkins/deployment.yaml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index aa279e9..e29c143 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -53,7 +53,7 @@ data: pipelineTriggers { triggers { scmTrigger { - spec('H/5 * * * *') + scmpoll_spec('H/5 * * * *') ignorePostCommitHooks(false) } } @@ -94,7 +94,7 @@ data: pipelineTriggers { triggers { scmTrigger { - spec('H/1 * * * *') + scmpoll_spec('H/1 * * * *') ignorePostCommitHooks(false) } } @@ -120,7 +120,7 @@ data: pipelineTriggers { triggers { scmTrigger { - spec('H/2 * * * *') + scmpoll_spec('H/2 * * * *') ignorePostCommitHooks(false) } } @@ -146,7 +146,7 @@ data: pipelineTriggers { triggers { scmTrigger { - spec('H/2 * * * *') + scmpoll_spec('H/2 * * * *') ignorePostCommitHooks(false) } } @@ -172,7 +172,7 @@ data: pipelineTriggers { triggers { scmTrigger { - 
spec('H/5 * * * *') + scmpoll_spec('H/5 * * * *') ignorePostCommitHooks(false) } } diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 7706807..4492579 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T14:25:00Z" + bstein.dev/restarted-at: "2026-01-20T14:35:00Z" spec: serviceAccountName: jenkins nodeSelector: From f5eec19e112758aaf351892775d171602be9281d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 11:54:15 -0300 Subject: [PATCH 049/416] jenkins: automate notifyCommit token --- services/jenkins/deployment.yaml | 3 +- services/jenkins/kustomization.yaml | 1 + .../jenkins/scripts/git-notify-token.groovy | 41 +++++++++++++++++++ services/jenkins/scripts/theme.groovy | 1 - 4 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 services/jenkins/scripts/git-notify-token.groovy diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 4492579..b5b3de6 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -40,8 +40,9 @@ spec: {{ end }} {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} + GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T14:35:00Z" + bstein.dev/restarted-at: "2026-01-20T14:52:41Z" spec: serviceAccountName: jenkins nodeSelector: diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index acb6fb4..0a03f5b 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -16,6 +16,7 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: + - git-notify-token.groovy=scripts/git-notify-token.groovy - theme.groovy=scripts/theme.groovy options: disableNameSuffixHash: true diff --git a/services/jenkins/scripts/git-notify-token.groovy b/services/jenkins/scripts/git-notify-token.groovy new file mode 100644 index 0000000..336c918 --- /dev/null +++ b/services/jenkins/scripts/git-notify-token.groovy @@ -0,0 +1,41 @@ +import hudson.plugins.git.ApiTokenPropertyConfiguration +import hudson.Util +import java.nio.charset.StandardCharsets +import java.security.MessageDigest + + +def entries = [ + [env: 'GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME', name: 'gitea-bstein-dev-home'], +] + +entries.each { entry -> + def token = System.getenv(entry.env) + if (!token || token.trim().isEmpty()) { + println("Git notifyCommit token ${entry.env} missing; skipping") + return + } + + try { + def config = ApiTokenPropertyConfiguration.get() + if (config.hasMatchingApiToken(token)) { + println("Git notifyCommit token ${entry.name} already configured") + return + } + + def digest = MessageDigest.getInstance("SHA-256") + def hash = Util.toHexString(digest.digest(token.getBytes(StandardCharsets.US_ASCII))) + + def field = ApiTokenPropertyConfiguration.class.getDeclaredField("apiTokens") + field.setAccessible(true) + def tokens = field.get(config) + + def ctor = ApiTokenPropertyConfiguration.HashedApiToken.class.getDeclaredConstructor(String.class, String.class) + ctor.setAccessible(true) + tokens.add(ctor.newInstance(entry.name, hash)) + config.save() + + println("Added git notifyCommit access token ${entry.name}") + } catch (Throwable e) { + 
println("Failed to configure git notifyCommit token ${entry.name}: ${e.class.simpleName}: ${e.message}") + } +} diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index cf171f7..5950bf4 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -8,7 +8,6 @@ if (decorators?.size() > 0) { def theme = decorators[0] theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css") theme.setJsUrl("") - theme.setTheme("") instance.save() println("Applied simple-theme-plugin dark theme") } else { From c80f26625d1820a982621663d146da5201a2fde0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:04:24 -0300 Subject: [PATCH 050/416] jenkins: move agent workspace off node disk --- services/jenkins/cache-pvc.yaml | 13 +++++++++++++ services/jenkins/configmap-jcasc.yaml | 5 +++++ services/jenkins/deployment.yaml | 6 ++++-- services/jenkins/kustomization.yaml | 2 ++ services/jenkins/plugins-pvc.yaml | 13 +++++++++++++ 5 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 services/jenkins/cache-pvc.yaml create mode 100644 services/jenkins/plugins-pvc.yaml diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml new file mode 100644 index 0000000..784c7d8 --- /dev/null +++ b/services/jenkins/cache-pvc.yaml @@ -0,0 +1,13 @@ +# services/jenkins/cache-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jenkins-cache + namespace: jenkins +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + storageClassName: astreae diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index e29c143..f485de8 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -258,6 +258,11 @@ data: templates: - name: "default" namespace: "jenkins" + workspaceVolume: + dynamicPVC: + accessModes: "ReadWriteOnce" + requestsSize: "5Gi" + storageClassName: "astreae" containers: - name: "jnlp" args: "^${computer.jnlpmac} ^${computer.name}" diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index b5b3de6..7ee1aad 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -161,9 +161,11 @@ spec: persistentVolumeClaim: claimName: jenkins - name: jenkins-cache - emptyDir: {} + persistentVolumeClaim: + claimName: jenkins-cache - name: plugin-dir - emptyDir: {} + persistentVolumeClaim: + claimName: jenkins-plugins - name: plugins configMap: name: jenkins-plugins diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index 0a03f5b..aab859a 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -6,6 +6,8 @@ resources: - namespace.yaml - serviceaccount.yaml - pvc.yaml + - cache-pvc.yaml + - plugins-pvc.yaml - configmap-jcasc.yaml - configmap-plugins.yaml - deployment.yaml diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml new file mode 100644 index 0000000..45a967b --- /dev/null +++ b/services/jenkins/plugins-pvc.yaml @@ -0,0 +1,13 @@ +# services/jenkins/plugins-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jenkins-plugins + namespace: jenkins +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + storageClassName: astreae From 9ac66919d54f4cdca2700d078c697340c404f891 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:09:23 -0300 
Subject: [PATCH 051/416] jenkins: expand pvc sizes and move /tmp to memory --- services/jenkins/cache-pvc.yaml | 2 +- services/jenkins/configmap-jcasc.yaml | 2 +- services/jenkins/deployment.yaml | 3 ++- services/jenkins/plugins-pvc.yaml | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml index 784c7d8..7538305 100644 --- a/services/jenkins/cache-pvc.yaml +++ b/services/jenkins/cache-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 5Gi + storage: 50Gi storageClassName: astreae diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index f485de8..5ee6a3e 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -261,7 +261,7 @@ data: workspaceVolume: dynamicPVC: accessModes: "ReadWriteOnce" - requestsSize: "5Gi" + requestsSize: "50Gi" storageClassName: "astreae" containers: - name: "jnlp" diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 7ee1aad..5f50084 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -176,4 +176,5 @@ spec: configMap: name: jenkins-init-scripts - name: tmp - emptyDir: {} + emptyDir: + medium: Memory diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml index 45a967b..2812c7a 100644 --- a/services/jenkins/plugins-pvc.yaml +++ b/services/jenkins/plugins-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 2Gi + storage: 20Gi storageClassName: astreae From 5c40efdbcc3499ac4c355df7d5277dcdde734aec Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:19:58 -0300 Subject: [PATCH 052/416] jenkins: right-size pvc requests --- services/jenkins/cache-pvc.yaml | 2 +- services/jenkins/configmap-jcasc.yaml | 2 +- services/jenkins/plugins-pvc.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml index 7538305..79e8dec 100644 --- a/services/jenkins/cache-pvc.yaml +++ b/services/jenkins/cache-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 50Gi + storage: 20Gi storageClassName: astreae diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 5ee6a3e..c2144fa 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -261,7 +261,7 @@ data: workspaceVolume: dynamicPVC: accessModes: "ReadWriteOnce" - requestsSize: "50Gi" + requestsSize: "20Gi" storageClassName: "astreae" containers: - name: "jnlp" diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml index 2812c7a..e26d07f 100644 --- a/services/jenkins/plugins-pvc.yaml +++ b/services/jenkins/plugins-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 20Gi + storage: 10Gi storageClassName: astreae From 1522b7a0195600c077086da2f07d73d563d0f181 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:21:42 -0300 Subject: [PATCH 053/416] jenkins: keep cache/plugin pvc sizes to avoid shrink --- services/jenkins/cache-pvc.yaml | 2 +- services/jenkins/plugins-pvc.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml index 79e8dec..7538305 100644 --- a/services/jenkins/cache-pvc.yaml +++ b/services/jenkins/cache-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: 
- storage: 20Gi + storage: 50Gi storageClassName: astreae diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml index e26d07f..2812c7a 100644 --- a/services/jenkins/plugins-pvc.yaml +++ b/services/jenkins/plugins-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 10Gi + storage: 20Gi storageClassName: astreae From 13891e794ac7631da002e4cb5aae68ba126bd32f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:32:27 -0300 Subject: [PATCH 054/416] jenkins: rotate cache/plugin pvcs --- services/jenkins/cache-pvc.yaml | 4 ++-- services/jenkins/deployment.yaml | 4 ++-- services/jenkins/plugins-pvc.yaml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml index 7538305..a9ed319 100644 --- a/services/jenkins/cache-pvc.yaml +++ b/services/jenkins/cache-pvc.yaml @@ -2,12 +2,12 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: jenkins-cache + name: jenkins-cache-v2 namespace: jenkins spec: accessModes: - ReadWriteOnce resources: requests: - storage: 50Gi + storage: 20Gi storageClassName: astreae diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 5f50084..9f8fe99 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -162,10 +162,10 @@ spec: claimName: jenkins - name: jenkins-cache persistentVolumeClaim: - claimName: jenkins-cache + claimName: jenkins-cache-v2 - name: plugin-dir persistentVolumeClaim: - claimName: jenkins-plugins + claimName: jenkins-plugins-v2 - name: plugins configMap: name: jenkins-plugins diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml index 2812c7a..06715eb 100644 --- a/services/jenkins/plugins-pvc.yaml +++ b/services/jenkins/plugins-pvc.yaml @@ -2,12 +2,12 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: jenkins-plugins + name: jenkins-plugins-v2 namespace: jenkins spec: accessModes: - ReadWriteOnce resources: requests: - storage: 20Gi + storage: 10Gi storageClassName: astreae From 6db7521114e167185775927fb72e5fe80ed37106 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:43:23 -0300 Subject: [PATCH 055/416] jenkins: add local dark theme css --- services/jenkins/deployment.yaml | 4 + services/jenkins/kustomization.yaml | 1 + services/jenkins/scripts/jenkins-theme.css | 97 ++++++++++++++++++++++ services/jenkins/scripts/theme.groovy | 2 +- 4 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 services/jenkins/scripts/jenkins-theme.css diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 9f8fe99..b69f134 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -94,6 +94,7 @@ spec: - -c - | set -e + mkdir -p /var/jenkins_home/userContent exec env $(cat /vault/secrets/jenkins-env) /usr/bin/tini -- /usr/local/bin/jenkins.sh ports: - name: http @@ -152,6 +153,9 @@ spec: mountPath: /config/jcasc - name: init-scripts mountPath: /usr/share/jenkins/ref/init.groovy.d + - name: init-scripts + mountPath: /var/jenkins_home/userContent/jenkins-theme.css + subPath: jenkins-theme.css - name: plugin-dir mountPath: /usr/share/jenkins/ref/plugins - name: tmp diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index aab859a..444dd6d 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -18,6 +18,7 @@ configMapGenerator: - name: jenkins-init-scripts 
namespace: jenkins files: + - jenkins-theme.css=scripts/jenkins-theme.css - git-notify-token.groovy=scripts/git-notify-token.groovy - theme.groovy=scripts/theme.groovy options: diff --git a/services/jenkins/scripts/jenkins-theme.css b/services/jenkins/scripts/jenkins-theme.css new file mode 100644 index 0000000..56fe193 --- /dev/null +++ b/services/jenkins/scripts/jenkins-theme.css @@ -0,0 +1,97 @@ +@import url("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css"); + +:root { + --atlas-bg: #0f1216; + --atlas-surface: #171b21; + --atlas-surface-alt: #1f252d; + --atlas-border: #2b313b; + --atlas-text: #e6e9ef; + --atlas-text-muted: #b3bac6; + --atlas-link: #8fb7ff; +} + +body, +#page-body, +#page-header, +#header, +#main-panel, +#main-panel-content, +#side-panel, +.top-sticker-inner, +.bottom-sticker-inner, +#breadcrumbBar, +#breadcrumbs { + background-color: var(--atlas-bg) !important; + color: var(--atlas-text) !important; +} + +#side-panel .task-link, +#breadcrumbs a, +#breadcrumbs, +#projectstatus th a, +#projectstatus td, +#projectstatus th { + color: var(--atlas-text-muted) !important; +} + +a, +a:visited, +a:link { + color: var(--atlas-link) !important; +} + +a:hover { + opacity: 0.85; +} + +#main-panel, +#main-panel-content, +#description, +.pane, +table.pane { + background-color: var(--atlas-surface) !important; + color: var(--atlas-text) !important; +} + +table.pane tr:nth-child(odd) td { + background-color: var(--atlas-surface) !important; +} + +table.pane tr:nth-child(even) td, +#projectstatus tr:hover td { + background-color: var(--atlas-surface-alt) !important; +} + +input, +select, +textarea, +#search-box { + background-color: var(--atlas-surface-alt) !important; + color: var(--atlas-text) !important; + border-color: var(--atlas-border) !important; +} + +#header, +#page-header { + background-color: #202734 !important; +} + +#header .login, +#page-header .login { + color: var(--atlas-text) !important; +} + +#side-panel .task-link, +#side-panel .task-link:visited, +#side-panel .task-link:hover { + color: var(--atlas-text) !important; +} + +#footer { + background-color: var(--atlas-bg) !important; + color: var(--atlas-text-muted) !important; +} + +.jenkins_ver:after { + content: "atlas dark"; +} diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index 5950bf4..fd12474 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -6,7 +6,7 @@ def decorators = instance.getExtensionList(SimpleThemeDecorator.class) if (decorators?.size() > 0) { def theme = decorators[0] - theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css") + theme.setCssUrl("https://ci.bstein.dev/userContent/jenkins-theme.css") theme.setJsUrl("") instance.save() println("Applied simple-theme-plugin dark theme") From 954d0d36b96c73ba7aea77771ad5e7512229faa0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:54:47 -0300 Subject: [PATCH 056/416] jenkins: mount init scripts into home --- services/jenkins/deployment.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index b69f134..7dff5af 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -153,6 +153,8 @@ spec: mountPath: /config/jcasc - name: init-scripts mountPath: /usr/share/jenkins/ref/init.groovy.d + - name: init-scripts + mountPath: /var/jenkins_home/init.groovy.d - name: 
init-scripts mountPath: /var/jenkins_home/userContent/jenkins-theme.css subPath: jenkins-theme.css From 163f98c5940a2784b7a52e17e03d0782c6c9a243 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 18:00:36 -0300 Subject: [PATCH 057/416] jenkins: inline dark theme css --- services/jenkins/deployment.yaml | 4 - services/jenkins/kustomization.yaml | 1 - services/jenkins/scripts/jenkins-theme.css | 97 --------------------- services/jenkins/scripts/theme.groovy | 99 +++++++++++++++++++++- 4 files changed, 98 insertions(+), 103 deletions(-) delete mode 100644 services/jenkins/scripts/jenkins-theme.css diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 7dff5af..0b62ee0 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -94,7 +94,6 @@ spec: - -c - | set -e - mkdir -p /var/jenkins_home/userContent exec env $(cat /vault/secrets/jenkins-env) /usr/bin/tini -- /usr/local/bin/jenkins.sh ports: - name: http @@ -155,9 +154,6 @@ spec: mountPath: /usr/share/jenkins/ref/init.groovy.d - name: init-scripts mountPath: /var/jenkins_home/init.groovy.d - - name: init-scripts - mountPath: /var/jenkins_home/userContent/jenkins-theme.css - subPath: jenkins-theme.css - name: plugin-dir mountPath: /usr/share/jenkins/ref/plugins - name: tmp diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index 444dd6d..aab859a 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -18,7 +18,6 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: - - jenkins-theme.css=scripts/jenkins-theme.css - git-notify-token.groovy=scripts/git-notify-token.groovy - theme.groovy=scripts/theme.groovy options: diff --git a/services/jenkins/scripts/jenkins-theme.css b/services/jenkins/scripts/jenkins-theme.css deleted file mode 100644 index 56fe193..0000000 --- a/services/jenkins/scripts/jenkins-theme.css +++ /dev/null @@ -1,97 +0,0 @@ -@import url("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css"); - -:root { - --atlas-bg: #0f1216; - --atlas-surface: #171b21; - --atlas-surface-alt: #1f252d; - --atlas-border: #2b313b; - --atlas-text: #e6e9ef; - --atlas-text-muted: #b3bac6; - --atlas-link: #8fb7ff; -} - -body, -#page-body, -#page-header, -#header, -#main-panel, -#main-panel-content, -#side-panel, -.top-sticker-inner, -.bottom-sticker-inner, -#breadcrumbBar, -#breadcrumbs { - background-color: var(--atlas-bg) !important; - color: var(--atlas-text) !important; -} - -#side-panel .task-link, -#breadcrumbs a, -#breadcrumbs, -#projectstatus th a, -#projectstatus td, -#projectstatus th { - color: var(--atlas-text-muted) !important; -} - -a, -a:visited, -a:link { - color: var(--atlas-link) !important; -} - -a:hover { - opacity: 0.85; -} - -#main-panel, -#main-panel-content, -#description, -.pane, -table.pane { - background-color: var(--atlas-surface) !important; - color: var(--atlas-text) !important; -} - -table.pane tr:nth-child(odd) td { - background-color: var(--atlas-surface) !important; -} - -table.pane tr:nth-child(even) td, -#projectstatus tr:hover td { - background-color: var(--atlas-surface-alt) !important; -} - -input, -select, -textarea, -#search-box { - background-color: var(--atlas-surface-alt) !important; - color: var(--atlas-text) !important; - border-color: var(--atlas-border) !important; -} - -#header, -#page-header { - background-color: #202734 !important; -} - -#header .login, -#page-header .login { - color: 
var(--atlas-text) !important; -} - -#side-panel .task-link, -#side-panel .task-link:visited, -#side-panel .task-link:hover { - color: var(--atlas-text) !important; -} - -#footer { - background-color: var(--atlas-bg) !important; - color: var(--atlas-text-muted) !important; -} - -.jenkins_ver:after { - content: "atlas dark"; -} diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index fd12474..b20169c 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -6,7 +6,104 @@ def decorators = instance.getExtensionList(SimpleThemeDecorator.class) if (decorators?.size() > 0) { def theme = decorators[0] - theme.setCssUrl("https://ci.bstein.dev/userContent/jenkins-theme.css") + theme.setCssUrl("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css") + theme.setCssRules(""" +:root { + --atlas-bg: #0f1216; + --atlas-surface: #171b21; + --atlas-surface-alt: #1f252d; + --atlas-border: #2b313b; + --atlas-text: #e6e9ef; + --atlas-text-muted: #b3bac6; + --atlas-link: #8fb7ff; +} + +body, +#page-body, +#page-header, +#header, +#main-panel, +#main-panel-content, +#side-panel, +.top-sticker-inner, +.bottom-sticker-inner, +#breadcrumbBar, +#breadcrumbs { + background-color: var(--atlas-bg) !important; + color: var(--atlas-text) !important; +} + +#side-panel .task-link, +#breadcrumbs a, +#breadcrumbs, +#projectstatus th a, +#projectstatus td, +#projectstatus th { + color: var(--atlas-text-muted) !important; +} + +a, +a:visited, +a:link { + color: var(--atlas-link) !important; +} + +a:hover { + opacity: 0.85; +} + +#main-panel, +#main-panel-content, +#description, +.pane, +table.pane { + background-color: var(--atlas-surface) !important; + color: var(--atlas-text) !important; +} + +table.pane tr:nth-child(odd) td { + background-color: var(--atlas-surface) !important; +} + +table.pane tr:nth-child(even) td, +#projectstatus tr:hover td { + background-color: var(--atlas-surface-alt) !important; +} + +input, +select, +textarea, +#search-box { + background-color: var(--atlas-surface-alt) !important; + color: var(--atlas-text) !important; + border-color: var(--atlas-border) !important; +} + +#header, +#page-header { + background-color: #202734 !important; +} + +#header .login, +#page-header .login { + color: var(--atlas-text) !important; +} + +#side-panel .task-link, +#side-panel .task-link:visited, +#side-panel .task-link:hover { + color: var(--atlas-text) !important; +} + +#footer { + background-color: var(--atlas-bg) !important; + color: var(--atlas-text-muted) !important; +} + +.jenkins_ver:after { + content: "atlas dark"; +} +""".stripIndent().trim()) theme.setJsUrl("") instance.save() println("Applied simple-theme-plugin dark theme") From c846d2c1ba20543e4b38bcd08048faf4e57dac02 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 18:11:13 -0300 Subject: [PATCH 058/416] ci: add root Jenkinsfile and update keycloak ldap job --- Jenkinsfile | 77 ++++++++++++++++++++++ services/keycloak/ldap-federation-job.yaml | 50 +++++++++++++- 2 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..4d6b23e --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,77 @@ +// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery. 
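+// Flow: run the glue tests on an arm64 Python agent, resolve which branch
+// Flux reconciles from gotk-sync.yaml, then push HEAD to that branch only
+// when this build is already on it. Every other branch stops after the test
+// stages, so CI stays read-only for feature branches.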
+pipeline {
+  agent {
+    kubernetes {
+      defaultContainer 'python'
+      yaml """
+apiVersion: v1
+kind: Pod
+spec:
+  nodeSelector:
+    hardware: rpi5
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: "true"
+  containers:
+    - name: python
+      image: python:3.12-slim
+      command:
+        - cat
+      tty: true
+"""
+    }
+  }
+  environment {
+    PIP_DISABLE_PIP_VERSION_CHECK = '1'
+    PYTHONUNBUFFERED = '1'
+  }
+  stages {
+    stage('Checkout') {
+      steps {
+        checkout scm
+      }
+    }
+    stage('Install deps') {
+      steps {
+        sh 'pip install --no-cache-dir -r ci/requirements.txt'
+      }
+    }
+    stage('Glue tests') {
+      steps {
+        sh 'pytest -q ci/tests/glue'
+      }
+    }
+    stage('Resolve Flux branch') {
+      steps {
+        script {
+          env.FLUX_BRANCH = sh(
+            returnStdout: true,
+            script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
+          ).trim()
+          if (!env.FLUX_BRANCH) {
+            error('Flux branch not found in gotk-sync.yaml')
+          }
+          echo "Flux branch: ${env.FLUX_BRANCH}"
+        }
+      }
+    }
+    stage('Promote') {
+      when {
+        expression {
+          def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
+          return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
+        }
+      }
+      steps {
+        withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
+          sh '''
+            set +x
+            git config user.email "jenkins@bstein.dev"
+            git config user.name "jenkins"
+            git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
+            git push origin HEAD:${FLUX_BRANCH}
+          '''
+        }
+      }
+    }
+  }
+}
diff --git a/services/keycloak/ldap-federation-job.yaml b/services/keycloak/ldap-federation-job.yaml
index 303fd9f..3c3f1c1 100644
--- a/services/keycloak/ldap-federation-job.yaml
+++ b/services/keycloak/ldap-federation-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-ldap-federation-11
+  name: keycloak-ldap-federation-12
   namespace: sso
 spec:
   backoffLimit: 2
@@ -325,6 +325,54 @@ spec:
              if status not in (201, 204):
                  raise SystemExit(f"Unexpected group mapper create status: {status}")

+          def ensure_user_attr_mapper(name: str, ldap_attr: str, user_attr: str):
+              mapper = None
+              for c in components:
+                  if c.get("name") == name and c.get("parentId") == ldap_component_id:
+                      mapper = c
+                      break
+
+              payload = {
+                  "name": name,
+                  "providerId": "user-attribute-ldap-mapper",
+                  "providerType": "org.keycloak.storage.ldap.mappers.LDAPStorageMapper",
+                  "parentId": ldap_component_id,
+                  "config": {
+                      "ldap.attribute": [ldap_attr],
+                      "user.model.attribute": [user_attr],
+                      "read.only": ["false"],
+                      "always.read.value.from.ldap": ["false"],
+                      "is.mandatory.in.ldap": ["false"],
+                  },
+              }
+
+              if mapper:
+                  payload["id"] = mapper["id"]
+                  payload["parentId"] = mapper.get("parentId", payload["parentId"])
+                  print(f"Updating LDAP user mapper: {payload['id']} ({name})")
+                  status, _, _ = http_json(
+                      "PUT",
+                      f"{base_url}/admin/realms/{realm}/components/{payload['id']}",
+                      token,
+                      payload,
+                  )
+                  if status not in (200, 204):
+                      raise SystemExit(f"Unexpected user mapper update status for {name}: {status}")
+              else:
+                  print(f"Creating LDAP user mapper: {name}")
+                  status, _, _ = http_json(
+                      "POST",
+                      f"{base_url}/admin/realms/{realm}/components",
+                      token,
+                      payload,
+                  )
+                  if status not in (201, 204):
+                      raise SystemExit(f"Unexpected user mapper create status for {name}: {status}")
+
+          ensure_user_attr_mapper("openldap-email", "mail", "email")
+          ensure_user_attr_mapper("openldap-first-name", "givenName", "firstName")
+          ensure_user_attr_mapper("openldap-last-name", "sn",
"lastName") + # Cleanup duplicate LDAP federation providers and their child components (mappers, etc). # Keep only the canonical provider we updated/created above. try: From 0bb45bca838ca082cffdd299ebfce52c081d97c9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 18:13:49 -0300 Subject: [PATCH 059/416] jenkins: fix dark theme injection --- services/jenkins/scripts/theme.groovy | 140 +++++++++++++++----------- 1 file changed, 83 insertions(+), 57 deletions(-) diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index b20169c..58755c0 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -1,21 +1,46 @@ import jenkins.model.Jenkins import org.codefirst.SimpleThemeDecorator +import org.jenkinsci.plugins.simpletheme.CssTextThemeElement def instance = Jenkins.get() def decorators = instance.getExtensionList(SimpleThemeDecorator.class) if (decorators?.size() > 0) { def theme = decorators[0] - theme.setCssUrl("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css") - theme.setCssRules(""" -:root { - --atlas-bg: #0f1216; - --atlas-surface: #171b21; - --atlas-surface-alt: #1f252d; - --atlas-border: #2b313b; - --atlas-text: #e6e9ef; - --atlas-text-muted: #b3bac6; - --atlas-link: #8fb7ff; + def cssRules = """ +:root, +.app-theme-picker__picker[data-theme=none] { + --background: #0f1216 !important; + --header-background: #141922 !important; + --header-border: #2b313b !important; + --white: #141922 !important; + --black: #e6e9ef !important; + --very-light-grey: #171b21 !important; + --light-grey: #202734 !important; + --medium-grey: #2b313b !important; + --dark-grey: #0b0f14 !important; + --text-color: #e6e9ef !important; + --text-color-secondary: #a6adba !important; + --card-background: #171b21 !important; + --card-border-color: #2b313b !important; + --pane-header-bg: #1f252d !important; + --pane-header-border-color: #2b313b !important; + --pane-border-color: #2b313b !important; + --pane-text-color: #e6e9ef !important; + --pane-header-text-color: #e6e9ef !important; + --link-color: #8fb7ff !important; + --link-color--hover: #b0ccff !important; + --link-dark-color: #e6e9ef !important; + --link-dark-color--hover: #b0ccff !important; + --input-color: #151a20 !important; + --input-border: #2b313b !important; + --input-border-hover: #3a424d !important; + --button-background: #232a33 !important; + --button-background--hover: #2b313b !important; + --button-background--active: #323b46 !important; + --item-background--hover: #232a33 !important; + --item-background--active: #2b313b !important; + --accent-color: #8fb7ff !important; } body, @@ -29,83 +54,84 @@ body, .bottom-sticker-inner, #breadcrumbBar, #breadcrumbs { - background-color: var(--atlas-bg) !important; - color: var(--atlas-text) !important; + background-color: var(--background) !important; + color: var(--text-color) !important; } -#side-panel .task-link, -#breadcrumbs a, -#breadcrumbs, -#projectstatus th a, +.jenkins-card, +.jenkins-section, +.jenkins-section__item, +#main-panel .jenkins-card, +#main-panel .jenkins-section { + background-color: var(--card-background) !important; + color: var(--text-color) !important; + border-color: var(--card-border-color) !important; +} + +table.pane, +table.pane td, +table.pane th, #projectstatus td, #projectstatus th { - color: var(--atlas-text-muted) !important; -} - -a, -a:visited, -a:link { - color: var(--atlas-link) !important; -} - -a:hover { - opacity: 0.85; -} - 
-#main-panel, -#main-panel-content, -#description, -.pane, -table.pane { - background-color: var(--atlas-surface) !important; - color: var(--atlas-text) !important; -} - -table.pane tr:nth-child(odd) td { - background-color: var(--atlas-surface) !important; + background-color: var(--card-background) !important; + color: var(--text-color) !important; } table.pane tr:nth-child(even) td, #projectstatus tr:hover td { - background-color: var(--atlas-surface-alt) !important; + background-color: #1f252d !important; } input, select, textarea, #search-box { - background-color: var(--atlas-surface-alt) !important; - color: var(--atlas-text) !important; - border-color: var(--atlas-border) !important; + background-color: #151a20 !important; + color: var(--text-color) !important; + border-color: var(--input-border) !important; } -#header, -#page-header { - background-color: #202734 !important; +a, +a:visited, +a:link { + color: var(--link-color) !important; } -#header .login, -#page-header .login { - color: var(--atlas-text) !important; +a:hover { + opacity: 0.85; } #side-panel .task-link, -#side-panel .task-link:visited, -#side-panel .task-link:hover { - color: var(--atlas-text) !important; +#breadcrumbs a, +#breadcrumbs, +#projectstatus th a { + color: var(--text-color-secondary) !important; +} + +.console-output, +.console-output pre, +pre, +code, +.CodeMirror { + background-color: #0c0f14 !important; + color: #d9dee7 !important; } #footer { - background-color: var(--atlas-bg) !important; - color: var(--atlas-text-muted) !important; + background-color: var(--background) !important; + color: var(--text-color-secondary) !important; } .jenkins_ver:after { content: "atlas dark"; } -""".stripIndent().trim()) +""".stripIndent().trim() + + theme.setElements([new CssTextThemeElement(cssRules)]) + theme.setCssUrl("") + theme.setCssRules(cssRules) theme.setJsUrl("") - instance.save() + theme.save() println("Applied simple-theme-plugin dark theme") } else { println("simple-theme-plugin not installed; skipping theme configuration") From 1fedb5ecbecd7eea132ce3f1a2beba326c5f89ec Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 23:03:39 -0300 Subject: [PATCH 060/416] maintenance: wire ariadne db and dashboards --- scripts/dashboards_render_atlas.py | 25 +++ services/maintenance/ariadne-deployment.yaml | 165 ++++++++++++++---- services/maintenance/ariadne-rbac.yaml | 14 +- .../monitoring/dashboards/atlas-testing.json | 113 ++++++++++++ .../monitoring/grafana-dashboard-testing.yaml | 113 ++++++++++++ .../vault/scripts/vault_k8s_auth_configure.sh | 2 +- 6 files changed, 399 insertions(+), 33 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 116bf21..a3fb372 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -340,6 +340,8 @@ ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{statu ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" +ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' +ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -2267,6 +2269,29 @@ def 
build_testing_dashboard(): instant=True, ) ) + panels.append( + stat_panel( + 10, + "Ariadne CI Coverage (%)", + ARIADNE_CI_COVERAGE, + {"h": 4, "w": 6, "x": 0, "y": 22}, + unit="percent", + decimals=1, + instant=True, + legend="{{branch}}", + ) + ) + panels.append( + table_panel( + 11, + "Ariadne CI Tests (latest)", + ARIADNE_CI_TESTS, + {"h": 6, "w": 18, "x": 6, "y": 22}, + unit="none", + transformations=sort_desc, + instant=True, + ) + ) return { "uid": "atlas-testing", diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index cd0d38c..57ce72b 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -20,14 +20,30 @@ spec: prometheus.io/path: "/metrics" vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "maintenance" - vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/portal/atlas-portal-db" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | - {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} - export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.database_url }}" {{ end }} {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" {{ end }} + {{ with secret "kv/data/atlas/nextcloud/nextcloud-db" }} + export NEXTCLOUD_DB_NAME="{{ .Data.data.database }}" + export NEXTCLOUD_DB_USER="{{ index .Data.data "db-username" }}" + export NEXTCLOUD_DB_PASSWORD="{{ index .Data.data "db-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/nextcloud/nextcloud-admin" }} + export NEXTCLOUD_ADMIN_USER="{{ index .Data.data "admin-user" }}" + export NEXTCLOUD_ADMIN_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/health/wger-admin" }} + export WGER_ADMIN_USERNAME="{{ .Data.data.username }}" + export WGER_ADMIN_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/finance/firefly-secrets" }} + export FIREFLY_CRON_TOKEN="{{ .Data.data.STATIC_CRON_TOKEN }}" + {{ end }} {{ with secret "kv/data/atlas/mailu/mailu-db-secret" }} export MAILU_DB_NAME="{{ .Data.data.database }}" export MAILU_DB_USER="{{ .Data.data.username }}" @@ -42,6 +58,35 @@ spec: export SMTP_PASSWORD="{{ .Data.data.password }}" export SMTP_FROM="no-reply-portal@bstein.dev" {{ end }} + {{ with secret "kv/data/atlas/comms/mas-admin-client-runtime" }} + export COMMS_MAS_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" }} + export COMMS_BOT_PASSWORD="{{ index .Data.data "bot-password" }}" + export COMMS_SEEDER_PASSWORD="{{ index .Data.data "seeder-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/synapse-db" }} + export COMMS_SYNAPSE_DB_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}" + {{ end }} + {{ with secret "kv/data/atlas/vault/vault-oidc-config" }} + export VAULT_OIDC_DISCOVERY_URL="{{ .Data.data.discovery_url }}" + export VAULT_OIDC_CLIENT_ID="{{ .Data.data.client_id }}" + export VAULT_OIDC_CLIENT_SECRET="{{ .Data.data.client_secret }}" + export VAULT_OIDC_DEFAULT_ROLE="{{ .Data.data.default_role }}" + export VAULT_OIDC_SCOPES="{{ .Data.data.scopes }}" + export VAULT_OIDC_USER_CLAIM="{{ .Data.data.user_claim }}" + export 
VAULT_OIDC_GROUPS_CLAIM="{{ .Data.data.groups_claim }}" + export VAULT_OIDC_TOKEN_POLICIES="{{ .Data.data.token_policies }}" + export VAULT_OIDC_ADMIN_GROUP="{{ .Data.data.admin_group }}" + export VAULT_OIDC_ADMIN_POLICIES="{{ .Data.data.admin_policies }}" + export VAULT_OIDC_DEV_GROUP="{{ .Data.data.dev_group }}" + export VAULT_OIDC_DEV_POLICIES="{{ .Data.data.dev_policies }}" + export VAULT_OIDC_USER_GROUP="{{ .Data.data.user_group }}" + export VAULT_OIDC_USER_POLICIES="{{ .Data.data.user_policies }}" + export VAULT_OIDC_REDIRECT_URIS="{{ .Data.data.redirect_uris }}" + export VAULT_OIDC_BOUND_AUDIENCES="{{ .Data.data.bound_audiences }}" + export VAULT_OIDC_BOUND_CLAIMS_TYPE="{{ .Data.data.bound_claims_type }}" + {{ end }} spec: serviceAccountName: ariadne nodeSelector: @@ -92,6 +137,8 @@ spec: value: dev - name: MAILU_DOMAIN value: bstein.dev + - name: MAILU_HOST + value: mail.bstein.dev - name: MAILU_SYNC_URL value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC @@ -102,46 +149,84 @@ spec: value: "5432" - name: NEXTCLOUD_NAMESPACE value: nextcloud - - name: NEXTCLOUD_MAIL_SYNC_CRONJOB - value: nextcloud-mail-sync - - name: NEXTCLOUD_MAIL_SYNC_WAIT_TIMEOUT_SEC - value: "90" - - name: NEXTCLOUD_MAIL_SYNC_JOB_TTL_SEC - value: "3600" + - name: NEXTCLOUD_POD_LABEL + value: app=nextcloud + - name: NEXTCLOUD_CONTAINER + value: nextcloud + - name: NEXTCLOUD_EXEC_TIMEOUT_SEC + value: "120" + - name: NEXTCLOUD_URL + value: https://cloud.bstein.dev + - name: NEXTCLOUD_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: NEXTCLOUD_DB_PORT + value: "5432" - name: WGER_NAMESPACE value: health - - name: WGER_USER_SYNC_CRONJOB - value: wger-user-sync - - name: WGER_ADMIN_CRONJOB - value: wger-admin-ensure - name: WGER_USER_SYNC_WAIT_TIMEOUT_SEC value: "90" + - name: WGER_POD_LABEL + value: app=wger + - name: WGER_CONTAINER + value: wger + - name: WGER_ADMIN_EMAIL + value: brad@bstein.dev - name: FIREFLY_NAMESPACE value: finance - - name: FIREFLY_USER_SYNC_CRONJOB - value: firefly-user-sync - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC value: "90" + - name: FIREFLY_POD_LABEL + value: app=firefly + - name: FIREFLY_CONTAINER + value: firefly + - name: FIREFLY_CRON_BASE_URL + value: http://firefly.finance.svc.cluster.local/api/v1/cron + - name: FIREFLY_CRON_TIMEOUT_SEC + value: "30" - name: VAULT_NAMESPACE value: vault - - name: VAULT_K8S_AUTH_CRONJOB - value: vault-k8s-auth-config - - name: VAULT_OIDC_CRONJOB - value: vault-oidc-config - - name: VAULT_JOB_WAIT_TIMEOUT_SEC - value: "120" + - name: VAULT_ADDR + value: http://vault.vault.svc.cluster.local:8200 + - name: VAULT_K8S_ROLE + value: vault-admin + - name: VAULT_K8S_ROLE_TTL + value: 1h - name: COMMS_NAMESPACE value: comms - - name: COMMS_GUEST_NAME_CRONJOB - value: guest-name-randomizer - - name: COMMS_PIN_INVITE_CRONJOB - value: pin-othrys-invite - - name: COMMS_RESET_ROOM_CRONJOB - value: othrys-room-reset - - name: COMMS_SEED_ROOM_CRONJOB - value: seed-othrys-room - - name: COMMS_JOB_WAIT_TIMEOUT_SEC - value: "60" + - name: COMMS_SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse:8008 + - name: COMMS_AUTH_BASE + value: http://matrix-authentication-service:8080 + - name: COMMS_MAS_ADMIN_API_BASE + value: http://matrix-authentication-service:8081/api/admin/v1 + - name: COMMS_MAS_TOKEN_URL + value: http://matrix-authentication-service:8080/oauth2/token + - name: COMMS_MAS_ADMIN_CLIENT_ID + value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM + - name: COMMS_SERVER_NAME + value: 
live.bstein.dev + - name: COMMS_ROOM_ALIAS + value: "#othrys:live.bstein.dev" + - name: COMMS_ROOM_NAME + value: Othrys + - name: COMMS_PIN_MESSAGE + value: "Invite guests: share https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join and choose 'Continue' -> 'Join as guest'." + - name: COMMS_SEEDER_USER + value: othrys-seeder + - name: COMMS_BOT_USER + value: atlasbot + - name: COMMS_SYNAPSE_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: COMMS_SYNAPSE_DB_PORT + value: "5432" + - name: COMMS_SYNAPSE_DB_NAME + value: synapse + - name: COMMS_SYNAPSE_DB_USER + value: synapse + - name: COMMS_TIMEOUT_SEC + value: "30" + - name: COMMS_GUEST_STALE_DAYS + value: "14" - name: VAULTWARDEN_NAMESPACE value: vaultwarden - name: VAULTWARDEN_POD_LABEL @@ -172,10 +257,22 @@ spec: value: "30 4 * * *" - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC value: "0 5 * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON + value: "*/5 * * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE + value: "30 4 * * *" - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN value: "15 3 * * *" + - name: ARIADNE_SCHEDULE_FIREFLY_CRON + value: "0 3 * * *" + - name: ARIADNE_SCHEDULE_POD_CLEANER + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE + value: "23 3 * * *" + - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER + value: "30 4 * * 0" - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_VAULT_OIDC @@ -192,6 +289,12 @@ spec: value: "true" - name: K8S_API_TIMEOUT_SEC value: "5" + - name: OPENSEARCH_URL + value: http://opensearch-master.logging.svc.cluster.local:9200 + - name: OPENSEARCH_LIMIT_BYTES + value: "1099511627776" + - name: OPENSEARCH_INDEX_PATTERNS + value: kube-*,journald-*,trace-analytics-* - name: METRICS_PATH value: "/metrics" resources: diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml index 8d2a2a9..e2f08c9 100644 --- a/services/maintenance/ariadne-rbac.yaml +++ b/services/maintenance/ariadne-rbac.yaml @@ -6,13 +6,25 @@ metadata: rules: - apiGroups: ["batch"] resources: - - cronjobs - jobs verbs: - get - list - watch - create + - apiGroups: [""] + resources: + - pods + verbs: + - get + - list + - watch + - delete + - apiGroups: [""] + resources: + - pods/exec + verbs: + - create --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index c9c0c9a..b76f909 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -471,6 +471,119 @@ } } ] + }, + { + "id": 10, + "type": "stat", + "title": "Ariadne CI Coverage (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 22 + }, + "targets": [ + { + "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", + "refId": "A", + "legendFormat": "{{branch}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + 
"fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 11, + "type": "table", + "title": "Ariadne CI Tests (latest)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 18, + "x": 6, + "y": 22 + }, + "targets": [ + { + "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] } ], "time": { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 7746f16..09c29a4 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -480,6 +480,119 @@ data: } } ] + }, + { + "id": 10, + "type": "stat", + "title": "Ariadne CI Coverage (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 22 + }, + "targets": [ + { + "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", + "refId": "A", + "legendFormat": "{{branch}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 11, + "type": "table", + "title": "Ariadne CI Tests (latest)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 18, + "x": 6, + "y": 22 + }, + "targets": [ + { + "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] } ], "time": { diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index a5ccb61..c14c5ec 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ - "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" + "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* 
shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ From fb6ddce0c72105cc5812e80b5fd5546fd042ad7a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 02:57:40 -0300 Subject: [PATCH 061/416] glue: centralize sync tasks in ariadne --- .../cert-manager/letsencrypt-prod.yaml | 2 +- .../sources/cert-manager/letsencrypt.yaml | 2 +- scripts/dashboards_render_atlas.py | 35 +++++- services/finance/firefly-cronjob.yaml | 1 + services/keycloak/deployment.yaml | 2 +- services/keycloak/realm-settings-job.yaml | 73 +++++++++++++ .../logging/opensearch-prune-cronjob.yaml | 1 + services/mailu/kustomization.yaml | 5 - services/maintenance/ariadne-deployment.yaml | 12 +- .../maintenance/image-sweeper-cronjob.yaml | 1 + services/maintenance/pod-cleaner-cronjob.yaml | 1 + .../monitoring/dashboards/atlas-testing.json | 103 ++++++++++++++++-- .../monitoring/grafana-dashboard-testing.yaml | 103 ++++++++++++++++-- services/nextcloud/cronjob.yaml | 1 + services/nextcloud/maintenance-cronjob.yaml | 1 + 15 files changed, 313 insertions(+), 30 deletions(-) diff --git a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml index 7f90f01..5795b09 100644 --- a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml +++ b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml @@ -5,7 +5,7 @@ metadata: name: letsencrypt-prod spec: acme: - email: brad.stein@gmail.com + email: brad@bstein.dev server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-prod-account-key diff --git a/infrastructure/sources/cert-manager/letsencrypt.yaml b/infrastructure/sources/cert-manager/letsencrypt.yaml index a988312..5fbe4e3 100644 --- a/infrastructure/sources/cert-manager/letsencrypt.yaml +++ b/infrastructure/sources/cert-manager/letsencrypt.yaml @@ -5,7 +5,7 @@ metadata: name: letsencrypt spec: acme: - email: brad.stein@gmail.com + email: brad@bstein.dev server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-account-key diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index a3fb372..509cf49 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -338,7 +338,9 @@ GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' +ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" +ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' @@ -2236,12 +2238,24 @@ def build_testing_dashboard(): instant=True, ) ) + panels.append( + timeseries_panel( + 12, + "Ariadne Task Runs vs Errors (1h)", + ARIADNE_TASK_RUNS_BY_STATUS_1H, + {"h": 6, "w": 24, "x": 0, "y": 12}, + unit="none", + legend="{{status}}", + legend_display="table", + legend_placement="right", + ) + ) panels.append( table_panel( 7, "Ariadne Task Errors (24h)", ARIADNE_TASK_ERRORS_24H, - {"h": 6, "w": 12, "x": 0, 
"y": 12}, + {"h": 6, "w": 12, "x": 0, "y": 18}, unit="none", transformations=sort_desc, instant=True, @@ -2252,7 +2266,7 @@ def build_testing_dashboard(): 8, "Ariadne Schedule Last Success (hours ago)", ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 6, "w": 12, "x": 12, "y": 12}, + {"h": 6, "w": 12, "x": 12, "y": 18}, unit="h", transformations=sort_desc, instant=True, @@ -2263,18 +2277,29 @@ def build_testing_dashboard(): 9, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS, - {"h": 4, "w": 24, "x": 0, "y": 18}, + {"h": 6, "w": 12, "x": 12, "y": 24}, unit="none", transformations=sort_desc, instant=True, ) ) + panels.append( + table_panel( + 13, + "Ariadne Schedule Last Error (hours ago)", + ARIADNE_SCHEDULE_LAST_ERROR_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 24}, + unit="h", + transformations=sort_desc, + instant=True, + ) + ) panels.append( stat_panel( 10, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 4, "w": 6, "x": 0, "y": 22}, + {"h": 4, "w": 6, "x": 0, "y": 30}, unit="percent", decimals=1, instant=True, @@ -2286,7 +2311,7 @@ def build_testing_dashboard(): 11, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 18, "x": 6, "y": 22}, + {"h": 6, "w": 18, "x": 6, "y": 30}, unit="none", transformations=sort_desc, instant=True, diff --git a/services/finance/firefly-cronjob.yaml b/services/finance/firefly-cronjob.yaml index 6c4d507..9e5c852 100644 --- a/services/finance/firefly-cronjob.yaml +++ b/services/finance/firefly-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: finance spec: schedule: "0 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/keycloak/deployment.yaml b/services/keycloak/deployment.yaml index 3d241c9..131169d 100644 --- a/services/keycloak/deployment.yaml +++ b/services/keycloak/deployment.yaml @@ -126,7 +126,7 @@ spec: - name: KC_EVENTS_LISTENERS value: jboss-logging,mailu-http - name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT - value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events + value: http://ariadne.maintenance.svc.cluster.local/events ports: - containerPort: 8080 name: http diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index fdee377..786948b 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -469,6 +469,79 @@ spec: if status not in (201, 204): raise SystemExit(f"Unexpected protocol mapper create response: {status}") + # Ensure mailu_email overrides email claim for service clients. 
+ excluded_email_clients = { + "account", + "account-console", + "admin-cli", + "security-admin-console", + "realm-management", + "broker", + } + status, clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients", + access_token, + ) + if status == 200 and isinstance(clients, list): + for client in clients: + if not isinstance(client, dict): + continue + if client.get("protocol") != "openid-connect": + continue + client_name = client.get("clientId") if isinstance(client.get("clientId"), str) else "" + if not client_name or client_name in excluded_email_clients: + continue + client_id = client.get("id") + if not client_id: + continue + email_mapper = { + "name": "mailu-email", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": False, + "config": { + "user.attribute": "mailu_email", + "claim.name": "email", + "jsonType.label": "String", + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "multivalued": "false", + "aggregate.attrs": "false", + }, + } + status, mappers = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + ) + existing = None + if status == 200 and isinstance(mappers, list): + for item in mappers: + if isinstance(item, dict) and item.get("name") == email_mapper["name"]: + existing = item + break + if existing and existing.get("id"): + email_mapper["id"] = existing["id"] + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing['id']}", + access_token, + email_mapper, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected mailu email mapper update response: {status}") + else: + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + email_mapper, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected mailu email mapper create response: {status}") + # Ensure MFA is on by default for newly-created users. 
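+          # Keycloak models this as realm "required actions": fetch the
+          # current list first, then flip the MFA setup action (typically
+          # CONFIGURE_TOTP) to enabled + default so new accounts must enroll.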
status, required_actions = http_json( "GET", diff --git a/services/logging/opensearch-prune-cronjob.yaml b/services/logging/opensearch-prune-cronjob.yaml index 75e72db..dc0dffb 100644 --- a/services/logging/opensearch-prune-cronjob.yaml +++ b/services/logging/opensearch-prune-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: logging spec: schedule: "23 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/mailu/kustomization.yaml b/services/mailu/kustomization.yaml index 5c111eb..7447f24 100644 --- a/services/mailu/kustomization.yaml +++ b/services/mailu/kustomization.yaml @@ -15,7 +15,6 @@ resources: - ingressroute.yaml - mailu-sync-job.yaml - mailu-sync-cronjob.yaml - - mailu-sync-listener.yaml - front-lb.yaml configMapGenerator: @@ -31,10 +30,6 @@ configMapGenerator: - sync.py=scripts/mailu_sync.py options: disableNameSuffixHash: true - - name: mailu-sync-listener - namespace: mailu-mailserver - files: - - listener.py=scripts/mailu_sync_listener.py - name: mailu-vault-entrypoint namespace: mailu-mailserver files: diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 57ce72b..57862ab 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -23,6 +23,7 @@ spec: vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" export PORTAL_DATABASE_URL="{{ .Data.data.database_url }}" {{ end }} {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} @@ -57,6 +58,7 @@ spec: export SMTP_USERNAME="no-reply-portal@bstein.dev" export SMTP_PASSWORD="{{ .Data.data.password }}" export SMTP_FROM="no-reply-portal@bstein.dev" + export MAILU_SYSTEM_PASSWORD="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/comms/mas-admin-client-runtime" }} export COMMS_MAS_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" @@ -140,7 +142,11 @@ spec: - name: MAILU_HOST value: mail.bstein.dev - name: MAILU_SYNC_URL - value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events + value: http://ariadne.maintenance.svc.cluster.local/events + - name: MAILU_EVENT_MIN_INTERVAL_SEC + value: "10" + - name: MAILU_SYSTEM_USERS + value: no-reply-portal@bstein.dev,no-reply-vaultwarden@bstein.dev - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC value: "180" - name: MAILU_DB_HOST @@ -263,8 +269,12 @@ spec: value: "30 4 * * *" - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC value: "*/15 * * * *" + - name: ARIADNE_SCHEDULE_WGER_USER_SYNC + value: "0 5 * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN value: "15 3 * * *" + - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC + value: "0 6 * * *" - name: ARIADNE_SCHEDULE_FIREFLY_CRON value: "0 3 * * *" - name: ARIADNE_SCHEDULE_POD_CLEANER diff --git a/services/maintenance/image-sweeper-cronjob.yaml b/services/maintenance/image-sweeper-cronjob.yaml index c94fcca..0039206 100644 --- a/services/maintenance/image-sweeper-cronjob.yaml +++ b/services/maintenance/image-sweeper-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: maintenance spec: schedule: "30 4 * * 0" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 2 failedJobsHistoryLimit: 2 diff --git a/services/maintenance/pod-cleaner-cronjob.yaml b/services/maintenance/pod-cleaner-cronjob.yaml index 
e083c85..99d13f6 100644 --- a/services/maintenance/pod-cleaner-cronjob.yaml +++ b/services/maintenance/pod-cleaner-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: maintenance spec: schedule: "0 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index b76f909..207077e 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -322,6 +322,43 @@ } ] }, + { + "id": 12, + "type": "timeseries", + "title": "Ariadne Task Runs vs Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, { "id": 7, "type": "table", @@ -334,7 +371,7 @@ "h": 6, "w": 12, "x": 0, - "y": 12 + "y": 18 }, "targets": [ { @@ -384,7 +421,7 @@ "h": 6, "w": 12, "x": 12, - "y": 12 + "y": 18 }, "targets": [ { @@ -431,10 +468,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 4, - "w": 24, - "x": 0, - "y": 18 + "h": 6, + "w": 12, + "x": 12, + "y": 24 }, "targets": [ { @@ -472,6 +509,56 @@ } ] }, + { + "id": 13, + "type": "table", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 24 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 10, "type": "stat", @@ -484,7 +571,7 @@ "h": 4, "w": 6, "x": 0, - "y": 22 + "y": 30 }, "targets": [ { @@ -547,7 +634,7 @@ "h": 6, "w": 18, "x": 6, - "y": 22 + "y": 30 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 09c29a4..362751b 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -331,6 +331,43 @@ data: } ] }, + { + "id": 12, + "type": "timeseries", + "title": "Ariadne Task Runs vs Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, { "id": 7, "type": "table", @@ -343,7 +380,7 @@ data: "h": 6, "w": 12, "x": 0, - "y": 12 + "y": 18 }, "targets": [ { @@ -393,7 +430,7 @@ data: "h": 6, "w": 12, "x": 12, - "y": 12 + "y": 18 }, "targets": [ { @@ -440,10 +477,10 @@ data: "uid": 
"atlas-vm" }, "gridPos": { - "h": 4, - "w": 24, - "x": 0, - "y": 18 + "h": 6, + "w": 12, + "x": 12, + "y": 24 }, "targets": [ { @@ -481,6 +518,56 @@ data: } ] }, + { + "id": 13, + "type": "table", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 24 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 10, "type": "stat", @@ -493,7 +580,7 @@ data: "h": 4, "w": 6, "x": 0, - "y": 22 + "y": 30 }, "targets": [ { @@ -556,7 +643,7 @@ data: "h": 6, "w": 18, "x": 6, - "y": 22 + "y": 30 }, "targets": [ { diff --git a/services/nextcloud/cronjob.yaml b/services/nextcloud/cronjob.yaml index cc0091b..58d8aa1 100644 --- a/services/nextcloud/cronjob.yaml +++ b/services/nextcloud/cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: nextcloud spec: schedule: "*/5 * * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/services/nextcloud/maintenance-cronjob.yaml b/services/nextcloud/maintenance-cronjob.yaml index d4008c7..177cc02 100644 --- a/services/nextcloud/maintenance-cronjob.yaml +++ b/services/nextcloud/maintenance-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: nextcloud spec: schedule: "30 4 * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: From 0d4f14c397040cae6ec96f9dde37bd280650a4ea Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:03:32 -0300 Subject: [PATCH 062/416] keycloak: bump realm settings job name --- services/keycloak/realm-settings-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index 786948b..6e6589d 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-33 + name: keycloak-realm-settings-34 namespace: sso spec: backoffLimit: 0 From 80a7ec26e21e19c5bfd986fc380594237ec806ec Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:05:53 -0300 Subject: [PATCH 063/416] rbac: allow ariadne to read cronjobs --- services/maintenance/ariadne-rbac.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml index e2f08c9..8a063bf 100644 --- a/services/maintenance/ariadne-rbac.yaml +++ b/services/maintenance/ariadne-rbac.yaml @@ -7,6 +7,7 @@ rules: - apiGroups: ["batch"] resources: - jobs + - cronjobs verbs: - get - list From 439d82430046a399c9759c58440400cf0ccd7e9b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:21:01 -0300 Subject: [PATCH 064/416] vault: allow ariadne to read needed secrets --- services/vault/scripts/vault_k8s_auth_configure.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index c14c5ec..2fce3f4 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ 
b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ - "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" + "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ From 0efc1ed6c41e301ab7e679fb02d03611cb1eae30 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:39:17 -0300 Subject: [PATCH 065/416] ariadne: split portal and ariadne db secrets --- services/maintenance/ariadne-deployment.yaml | 4 +++- services/vault/scripts/vault_k8s_auth_configure.sh | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 57862ab..bb9766f 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -24,7 +24,9 @@ spec: vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" - export PORTAL_DATABASE_URL="{{ .Data.data.database_url }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" {{ end }} {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 2fce3f4..bc03cf4 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ - "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" "" + "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ From b87fe4899ca137914d29856d8f80a765359fd53e Mon Sep 17 
00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:53:34 -0300 Subject: [PATCH 066/416] maintenance: bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index a86453e..fd54441 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-6 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-10 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From a41ac1548cb51aeaafa07f6974766a6f74180b4c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 04:05:41 -0300 Subject: [PATCH 067/416] maintenance: fix ariadne comms endpoints and exec RBAC --- services/maintenance/ariadne-deployment.yaml | 8 ++++---- services/maintenance/ariadne-rbac.yaml | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index bb9766f..069f388 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -202,13 +202,13 @@ spec: - name: COMMS_NAMESPACE value: comms - name: COMMS_SYNAPSE_BASE - value: http://othrys-synapse-matrix-synapse:8008 + value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008 - name: COMMS_AUTH_BASE - value: http://matrix-authentication-service:8080 + value: http://matrix-authentication-service.comms.svc.cluster.local:8080 - name: COMMS_MAS_ADMIN_API_BASE - value: http://matrix-authentication-service:8081/api/admin/v1 + value: http://matrix-authentication-service.comms.svc.cluster.local:8081/api/admin/v1 - name: COMMS_MAS_TOKEN_URL - value: http://matrix-authentication-service:8080/oauth2/token + value: http://matrix-authentication-service.comms.svc.cluster.local:8080/oauth2/token - name: COMMS_MAS_ADMIN_CLIENT_ID value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM - name: COMMS_SERVER_NAME diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml index 8a063bf..88689cb 100644 --- a/services/maintenance/ariadne-rbac.yaml +++ b/services/maintenance/ariadne-rbac.yaml @@ -25,6 +25,7 @@ rules: resources: - pods/exec verbs: + - get - create --- From 5e4ed17942bb8be5ad56005175bc3e13c07ee262 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 05:03:26 -0300 Subject: [PATCH 068/416] maintenance: bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index fd54441..a53ffee 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-10 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 810e4c0efbc56ebe072e2298b29153222a29986c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 10:20:53 -0300 Subject: [PATCH 069/416] flux: align imagepolicy tag setters --- services/bstein-dev-home/kustomization.yaml | 4 ++-- services/maintenance/kustomization.yaml | 2 +- services/pegasus/deployment.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) 
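[Note: patch 065 above stops aliasing PORTAL_DATABASE_URL to Ariadne's own database and renders it from the separate portal/atlas-portal-db secret, so the two stores can diverge. A sketch of the consuming side under that split, with SQLAlchemy as an assumed client; the env var names come from the injected template, the table name is illustrative:

    import os
    from sqlalchemy import create_engine, text

    # Distinct engines: Ariadne's own state vs. the portal's data, per the split secrets.
    ariadne_db = create_engine(os.environ["ARIADNE_DATABASE_URL"], pool_pre_ping=True)
    portal_db = create_engine(os.environ["PORTAL_DATABASE_URL"], pool_pre_ping=True)

    def portal_user_count() -> int:
        # Hypothetical table; shows read-only use of the portal connection.
        with portal_db.connect() as conn:
            return conn.execute(text("SELECT count(*) FROM users")).scalar_one()
]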
diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ec137dc..26840ab 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,9 +20,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} + newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} + newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index a53ffee..daee5f1 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script diff --git a/services/pegasus/deployment.yaml b/services/pegasus/deployment.yaml index bc3db70..b6a1639 100644 --- a/services/pegasus/deployment.yaml +++ b/services/pegasus/deployment.yaml @@ -72,7 +72,7 @@ spec: containers: - name: pegasus - image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus"} + image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus:tag"} imagePullPolicy: Always env: - name: PEGASUS_MEDIA_ROOT From 6ac3b41b30a82e233d2e8ad23b3a53ad31476a05 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 10:33:06 -0300 Subject: [PATCH 070/416] flux: align image automation namespaces --- .../applications/bstein-dev-home/image-automation.yaml | 2 +- .../flux-system/applications/pegasus/image-automation.yaml | 2 +- .../flux-system/platform/maintenance/image-automation.yaml | 2 +- services/pegasus/kustomization.yaml | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 643d479..10d7913 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: bstein-dev-home - namespace: flux-system + namespace: bstein-dev-home spec: interval: 1m0s sourceRef: diff --git a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml index ec0494e..d11422a 100644 --- a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: pegasus - namespace: flux-system + namespace: jellyfin spec: interval: 1m0s sourceRef: diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml index 867cae4..9f3214b 100644 --- 
a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: maintenance - namespace: flux-system + namespace: maintenance spec: interval: 1m0s sourceRef: diff --git a/services/pegasus/kustomization.yaml b/services/pegasus/kustomization.yaml index bef2b40..05c3baa 100644 --- a/services/pegasus/kustomization.yaml +++ b/services/pegasus/kustomization.yaml @@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - configmap.yaml + - image.yaml - vault-serviceaccount.yaml - secretproviderclass.yaml - service.yaml From 4de4630911703f0f4b2cc30dfc6e0575d7d6d599 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 10:34:25 -0300 Subject: [PATCH 071/416] flux: fix image automation templates --- .../applications/bstein-dev-home/image-automation.yaml | 2 +- .../flux-system/platform/maintenance/image-automation.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 10d7913..8b2900c 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -18,7 +18,7 @@ spec: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}" + messageTemplate: "chore(bstein-dev-home): update images to {{range .Changed.Images}}{{.}}{{end}}" push: branch: feature/ariadne update: diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml index 9f3214b..48e4c30 100644 --- a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -18,7 +18,7 @@ spec: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(maintenance): update images to {{range .Updated.Images}}{{.}}{{end}}" + messageTemplate: "chore(maintenance): update images to {{range .Changed.Images}}{{.}}{{end}}" push: branch: feature/ariadne update: From 7cf5e7e39d76aa9bc2a5c440874a0e5ea5d5fa3a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 10:35:29 -0300 Subject: [PATCH 072/416] flux: simplify image automation messages --- .../applications/bstein-dev-home/image-automation.yaml | 2 +- .../flux-system/platform/maintenance/image-automation.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 8b2900c..f1d41be 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -18,7 +18,7 @@ spec: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(bstein-dev-home): update images to {{range .Changed.Images}}{{.}}{{end}}" + messageTemplate: "chore(bstein-dev-home): automated image update" push: branch: feature/ariadne update: diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml 
b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml index 48e4c30..6e8f612 100644 --- a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -18,7 +18,7 @@ spec: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(maintenance): update images to {{range .Changed.Images}}{{.}}{{end}}" + messageTemplate: "chore(maintenance): automated image update" push: branch: feature/ariadne update: From 4484fed039ca841278f13e26a2429799c91b1700 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 13:35:55 +0000 Subject: [PATCH 073/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index daee5f1..088ce48 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -23,11 +23,9 @@ resources: - node-image-sweeper-serviceaccount.yaml - node-image-sweeper-daemonset.yaml - image-sweeper-cronjob.yaml - images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne:tag"} - + newTag: 0.1.0-15 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From d8a3b5250ee165dd24d9bd78d8ac0c0ca48f5a93 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 13:36:39 +0000 Subject: [PATCH 074/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 26840ab..78f5e68 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,9 +20,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From a9f6b04baafdaa1a18c38808c83edc6287fda588 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 14:04:54 +0000 Subject: [PATCH 075/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 088ce48..05f3be2 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-15 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-16 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 698b2fd96bacc582fe4bf0101c99270a8acce4be Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 11:29:29 -0300 Subject: [PATCH 076/416] monitoring: refresh testing 
dashboard --- .gitignore | 1 + scripts/dashboards_render_atlas.py | 200 ++-- .../monitoring/dashboards/atlas-testing.json | 896 ++++++++++++------ .../monitoring/grafana-dashboard-testing.yaml | 896 ++++++++++++------ 4 files changed, 1401 insertions(+), 592 deletions(-) diff --git a/.gitignore b/.gitignore index 8d0ab1e..7543bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ __pycache__/ *.py[cod] .pytest_cache .venv +.venv-ci tmp/ diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 509cf49..6eaafb4 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -339,6 +339,9 @@ GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" @@ -696,8 +699,10 @@ def bargauge_panel( grid, *, unit="none", + legend=None, links=None, limit=None, + sort_order="desc", thresholds=None, decimals=None, instant=False, @@ -710,7 +715,12 @@ def bargauge_panel( "datasource": PROM_DS, "gridPos": grid, "targets": [ - {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})} + { + "expr": expr, + "refId": "A", + "legendFormat": legend or "{{node}}", + **({"instant": True} if instant else {}), + } ], "fieldConfig": { "defaults": { @@ -748,7 +758,7 @@ def bargauge_panel( panel["transformations"] = [ { "id": "sortBy", - "options": {"fields": ["Value"], "order": "desc"}, + "options": {"fields": ["Value"], "order": sort_order}, } ] if limit: @@ -2163,7 +2173,24 @@ def build_mail_dashboard(): def build_testing_dashboard(): panels = [] - sort_desc = [{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}] + age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 24}, + {"color": "red", "value": 48}, + ], + } + recent_error_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 1}, + {"color": "yellow", "value": 6}, + {"color": "green", "value": 24}, + ], + } panels.append( stat_panel( @@ -2184,66 +2211,56 @@ def build_testing_dashboard(): ) ) panels.append( - table_panel( + stat_panel( 2, "Glue Jobs Missing Success", - GLUE_MISSING_ACTIVE, - {"h": 4, "w": 6, "x": 6, "y": 0}, + GLUE_MISSING_COUNT, + {"h": 4, "w": 4, "x": 4, "y": 0}, unit="none", - transformations=sort_desc, - instant=True, ) ) panels.append( - table_panel( + stat_panel( 3, "Glue Jobs Suspended", - GLUE_SUSPENDED, - {"h": 4, "w": 6, "x": 12, "y": 0}, + GLUE_SUSPENDED_COUNT, + {"h": 4, "w": 4, "x": 8, "y": 0}, unit="none", - transformations=sort_desc, - instant=True, ) ) panels.append( - table_panel( + stat_panel( 4, - "Glue Jobs 
Active Runs", - GLUE_ACTIVE, - {"h": 4, "w": 6, "x": 18, "y": 0}, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H_TOTAL, + {"h": 4, "w": 4, "x": 12, "y": 0}, unit="none", - transformations=sort_desc, - instant=True, ) ) panels.append( - table_panel( + stat_panel( 5, - "Glue Jobs Last Success (hours ago)", - GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 4}, - unit="h", - transformations=sort_desc, - instant=True, + "Ariadne Task Errors (24h)", + ARIADNE_TASK_ERRORS_24H_TOTAL, + {"h": 4, "w": 4, "x": 16, "y": 0}, + unit="none", ) ) panels.append( - table_panel( + stat_panel( 6, - "Glue Jobs Last Schedule (hours ago)", - GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 4}, - unit="h", - transformations=sort_desc, - instant=True, + "Ariadne Task Runs (1h)", + ARIADNE_TASK_RUNS_1H_TOTAL, + {"h": 4, "w": 4, "x": 20, "y": 0}, + unit="none", ) ) panels.append( timeseries_panel( - 12, + 7, "Ariadne Task Runs vs Errors (1h)", ARIADNE_TASK_RUNS_BY_STATUS_1H, - {"h": 6, "w": 24, "x": 0, "y": 12}, + {"h": 6, "w": 24, "x": 0, "y": 4}, unit="none", legend="{{status}}", legend_display="table", @@ -2251,55 +2268,110 @@ def build_testing_dashboard(): ) ) panels.append( - table_panel( - 7, + bargauge_panel( + 8, "Ariadne Task Errors (24h)", ARIADNE_TASK_ERRORS_24H, - {"h": 6, "w": 12, "x": 0, "y": 18}, + {"h": 8, "w": 12, "x": 0, "y": 10}, unit="none", - transformations=sort_desc, instant=True, + legend="{{task}}", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5}, + ], + }, ) ) panels.append( - table_panel( - 8, - "Ariadne Schedule Last Success (hours ago)", - ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 6, "w": 12, "x": 12, "y": 18}, - unit="h", - transformations=sort_desc, - instant=True, - ) - ) - panels.append( - table_panel( + bargauge_panel( 9, - "Ariadne Access Requests", - ARIADNE_ACCESS_REQUESTS, - {"h": 6, "w": 12, "x": 12, "y": 24}, + "Ariadne Task Success (24h)", + ARIADNE_TASK_SUCCESS_24H, + {"h": 8, "w": 12, "x": 12, "y": 10}, unit="none", - transformations=sort_desc, instant=True, + legend="{{task}}", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 1}, + {"color": "yellow", "value": 5}, + {"color": "green", "value": 10}, + ], + }, ) ) panels.append( - table_panel( - 13, + bargauge_panel( + 10, "Ariadne Schedule Last Error (hours ago)", ARIADNE_SCHEDULE_LAST_ERROR_HOURS, - {"h": 6, "w": 12, "x": 0, "y": 24}, + {"h": 8, "w": 12, "x": 0, "y": 18}, unit="h", - transformations=sort_desc, instant=True, + legend="{{task}}", + thresholds=recent_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 11, + "Ariadne Schedule Last Success (hours ago)", + ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, + {"h": 8, "w": 12, "x": 12, "y": 18}, + unit="h", + instant=True, + legend="{{task}}", + thresholds=age_thresholds, + ) + ) + panels.append( + bargauge_panel( + 12, + "Glue Jobs Last Success (hours ago)", + GLUE_LAST_SUCCESS_AGE_HOURS, + {"h": 8, "w": 12, "x": 0, "y": 26}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + ) + ) + panels.append( + bargauge_panel( + 13, + "Glue Jobs Last Schedule (hours ago)", + GLUE_LAST_SCHEDULE_AGE_HOURS, + {"h": 8, "w": 12, "x": 12, "y": 26}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + ) + ) + panels.append( + bargauge_panel( + 14, + 
"Ariadne Access Requests", + ARIADNE_ACCESS_REQUESTS, + {"h": 6, "w": 8, "x": 0, "y": 34}, + unit="none", + instant=True, + legend="{{status}}", ) ) panels.append( stat_panel( - 10, + 15, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 4, "w": 6, "x": 0, "y": 30}, + {"h": 6, "w": 4, "x": 8, "y": 34}, unit="percent", decimals=1, instant=True, @@ -2308,12 +2380,12 @@ def build_testing_dashboard(): ) panels.append( table_panel( - 11, + 16, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 18, "x": 6, "y": 30}, + {"h": 6, "w": 12, "x": 12, "y": 34}, unit="none", - transformations=sort_desc, + transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, ) ) diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index 207077e..420abf2 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -74,7 +74,7 @@ }, { "id": 2, - "type": "table", + "type": "stat", "title": "Glue Jobs Missing Success", "datasource": { "type": "prometheus", @@ -82,49 +82,59 @@ }, "gridPos": { "h": 4, - "w": 6, - "x": 6, + "w": 4, + "x": 4, "y": 0 }, "targets": [ { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A", - "instant": true + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 3, - "type": "table", + "type": "stat", "title": "Glue Jobs Suspended", "datasource": { "type": "prometheus", @@ -132,198 +142,238 @@ }, "gridPos": { "h": 4, - "w": 6, - "x": 12, + "w": 4, + "x": 8, "y": 0 }, "targets": [ { - "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", - "refId": "A", - "instant": true + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + 
"displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 4, - "type": "table", - "title": "Glue Jobs Active Runs", + "type": "stat", + "title": "Ariadne Task Errors (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, - "w": 6, - "x": 18, + "w": 4, + "x": 12, "y": 0 }, "targets": [ { - "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 5, - "type": "table", - "title": "Glue Jobs Last Success (hours ago)", + "type": "stat", + "title": "Ariadne Task Errors (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 + "h": 4, + "w": 4, + "x": 16, + "y": 0 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "h", + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 6, - "type": "table", - "title": "Glue Jobs Last Schedule (hours ago)", + "type": "stat", + "title": "Ariadne Task Runs (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 + "h": 4, + "w": 4, + "x": 20, + "y": 0 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and 
on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "h", + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { - "id": 12, + "id": 7, "type": "timeseries", "title": "Ariadne Task Runs vs Errors (1h)", "datasource": { @@ -334,7 +384,7 @@ "h": 6, "w": 24, "x": 0, - "y": 12 + "y": 4 }, "targets": [ { @@ -360,94 +410,68 @@ } }, { - "id": 7, - "type": "table", + "id": 8, + "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 10 }, "targets": [ { "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", "refId": "A", + "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "none", - "custom": { - "filterable": true + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 8, - "type": "table", - "title": "Ariadne Schedule Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 18 - }, - "targets": [ - { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, { "id": "sortBy", "options": { @@ -461,93 +485,67 @@ }, { "id": 9, - "type": "table", - "title": "Ariadne Access Requests", + "type": "bargauge", + "title": "Ariadne Task Success (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 10 }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", "refId": "A", + "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "none", - 
"custom": { - "filterable": true + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "green", + "value": 10 + } + ] } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 13, - "type": "table", - "title": "Ariadne Schedule Last Error (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 24 - }, - "targets": [ - { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, { "id": "sortBy", "options": { @@ -561,6 +559,376 @@ }, { "id": 10, + "type": "bargauge", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "green", + "value": 24 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 11, + "type": "bargauge", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 12, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + 
"uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "targets": [ + { + "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 13, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "targets": [ + { + "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 34 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "legendFormat": "{{status}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -568,10 +936,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 30 + "h": 6, + "w": 4, + "x": 8, + "y": 34 }, "targets": [ { @@ -623,7 +991,7 @@ } }, { - "id": 11, + "id": 16, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -632,9 +1000,9 @@ }, "gridPos": { "h": 6, - "w": 18, - "x": 6, - "y": 30 + "w": 12, + "x": 12, + "y": 34 
}, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 362751b..52b2836 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -83,7 +83,7 @@ data: }, { "id": 2, - "type": "table", + "type": "stat", "title": "Glue Jobs Missing Success", "datasource": { "type": "prometheus", @@ -91,49 +91,59 @@ data: }, "gridPos": { "h": 4, - "w": 6, - "x": 6, + "w": 4, + "x": 4, "y": 0 }, "targets": [ { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A", - "instant": true + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 3, - "type": "table", + "type": "stat", "title": "Glue Jobs Suspended", "datasource": { "type": "prometheus", @@ -141,198 +151,238 @@ data: }, "gridPos": { "h": 4, - "w": 6, - "x": 12, + "w": 4, + "x": 8, "y": 0 }, "targets": [ { - "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", - "refId": "A", - "instant": true + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 4, - "type": "table", - "title": "Glue Jobs Active Runs", + "type": "stat", + "title": "Ariadne Task Errors (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, - "w": 6, - "x": 18, + "w": 4, + "x": 12, "y": 0 }, 
"targets": [ { - "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 5, - "type": "table", - "title": "Glue Jobs Last Success (hours ago)", + "type": "stat", + "title": "Ariadne Task Errors (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 + "h": 4, + "w": 4, + "x": 16, + "y": 0 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "h", + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 6, - "type": "table", - "title": "Glue Jobs Last Schedule (hours ago)", + "type": "stat", + "title": "Ariadne Task Runs (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 + "h": 4, + "w": 4, + "x": 20, + "y": 0 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "h", + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": 
"area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { - "id": 12, + "id": 7, "type": "timeseries", "title": "Ariadne Task Runs vs Errors (1h)", "datasource": { @@ -343,7 +393,7 @@ data: "h": 6, "w": 24, "x": 0, - "y": 12 + "y": 4 }, "targets": [ { @@ -369,94 +419,68 @@ data: } }, { - "id": 7, - "type": "table", + "id": 8, + "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 10 }, "targets": [ { "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", "refId": "A", + "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "none", - "custom": { - "filterable": true + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 8, - "type": "table", - "title": "Ariadne Schedule Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 18 - }, - "targets": [ - { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, { "id": "sortBy", "options": { @@ -470,93 +494,67 @@ data: }, { "id": 9, - "type": "table", - "title": "Ariadne Access Requests", + "type": "bargauge", + "title": "Ariadne Task Success (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 10 }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", "refId": "A", + "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "none", - "custom": { - "filterable": true + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "green", + "value": 10 + } + ] } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 13, - "type": 
"table", - "title": "Ariadne Schedule Last Error (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 24 - }, - "targets": [ - { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, { "id": "sortBy", "options": { @@ -570,6 +568,376 @@ data: }, { "id": 10, + "type": "bargauge", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "green", + "value": 24 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 11, + "type": "bargauge", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 12, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "targets": [ + { + "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": 
"gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 13, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "targets": [ + { + "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 34 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "legendFormat": "{{status}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -577,10 +945,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 30 + "h": 6, + "w": 4, + "x": 8, + "y": 34 }, "targets": [ { @@ -632,7 +1000,7 @@ data: } }, { - "id": 11, + "id": 16, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -641,9 +1009,9 @@ data: }, "gridPos": { "h": 6, - "w": 18, - "x": 6, - "y": 30 + "w": 12, + "x": 12, + "y": 34 }, "targets": [ { From 98b063f2dd7b57413b3e7d672336b3a6f7dbd95f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 11:45:11 -0300 Subject: [PATCH 077/416] grafana: allow email-based oauth user lookup --- services/monitoring/helmrelease.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 304de05..02bc482 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -354,6 +354,8 @@ spec: GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(groups, 'admin') && 'Admin' || 'Viewer'" GF_AUTH_GENERIC_OAUTH_USE_PKCE: "true" 
GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: "false" + GF_AUTH_GENERIC_OAUTH_ALLOW_INSECURE_EMAIL_LOOKUP: "true" + GF_AUTH_GENERIC_OAUTH_EMAIL_ATTRIBUTE_PATH: "email" GF_AUTH_SIGNOUT_REDIRECT_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/logout?redirect_uri=https://metrics.bstein.dev/" grafana.ini: server: From 6eeb5512397ff450ea03c06a36b288b0808468d4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:08:23 -0300 Subject: [PATCH 078/416] monitoring: add grafana user dedupe job --- .../monitoring/grafana-user-dedupe-job.yaml | 51 +++++++++++++++++++ services/monitoring/kustomization.yaml | 1 + 2 files changed, 52 insertions(+) create mode 100644 services/monitoring/grafana-user-dedupe-job.yaml diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml new file mode 100644 index 0000000..b633a19 --- /dev/null +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -0,0 +1,51 @@ +# services/monitoring/grafana-user-dedupe-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: grafana-user-dedupe + namespace: monitoring +spec: + backoffLimit: 1 + template: + spec: + restartPolicy: Never + containers: + - name: dedupe + image: alpine:3.20 + command: + - /bin/sh + - -c + args: + - | + set -euo pipefail + apk add --no-cache sqlite + db="/var/lib/grafana/grafana.db" + if [ ! -f "$db" ]; then + echo "grafana db not found at $db" + exit 1 + fi + if [ -z "${GRAFANA_DEDUPE_EMAILS}" ]; then + echo "GRAFANA_DEDUPE_EMAILS is required" + exit 1 + fi + for email in $(echo "${GRAFANA_DEDUPE_EMAILS}" | tr ',' ' '); do + ids="$(sqlite3 "$db" "select id from user where email = '${email}';")" + if [ -z "$ids" ]; then + echo "no grafana user found for ${email}" + continue + fi + echo "deleting grafana users with ids: ${ids}" + sqlite3 "$db" "delete from user_auth where user_id in (${ids});" + sqlite3 "$db" "delete from user where id in (${ids});" + done + echo "done" + env: + - name: GRAFANA_DEDUPE_EMAILS + value: brad.stein@gmail.com,brad@bstein.dev + volumeMounts: + - name: grafana-storage + mountPath: /var/lib/grafana + volumes: + - name: grafana-storage + persistentVolumeClaim: + claimName: grafana diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 7d0b01b..86ab826 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -24,6 +24,7 @@ resources: - grafana-folders.yaml - helmrelease.yaml - grafana-org-bootstrap.yaml + - grafana-user-dedupe-job.yaml configMapGenerator: - name: postmark-exporter-script From a0caeb407c308e2d3a56a321d5bb2262b7ce6829 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:11:28 -0300 Subject: [PATCH 079/416] monitoring: dedupe grafana user via api --- .../monitoring/grafana-user-dedupe-job.yaml | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index b633a19..833eb70 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,8 +2,17 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe + name: grafana-user-dedupe-api namespace: monitoring + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "monitoring" + vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" + 
vault.hashicorp.com/agent-inject-template-grafana-env.sh: | + {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} + export GRAFANA_USER="{{ index .Data.data "admin-user" }}" + export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} spec: backoffLimit: 1 template: @@ -18,10 +27,15 @@ spec: args: - | set -euo pipefail - apk add --no-cache sqlite - db="/var/lib/grafana/grafana.db" - if [ ! -f "$db" ]; then - echo "grafana db not found at $db" + apk add --no-cache curl jq + . /vault/secrets/grafana-env.sh + grafana_url="${GRAFANA_URL}" + if [ -z "${grafana_url}" ]; then + echo "GRAFANA_URL is required" + exit 1 + fi + if [ -z "${GRAFANA_USER}" ] || [ -z "${GRAFANA_PASSWORD}" ]; then + echo "Grafana admin credentials missing" exit 1 fi if [ -z "${GRAFANA_DEDUPE_EMAILS}" ]; then @@ -29,23 +43,19 @@ spec: exit 1 fi for email in $(echo "${GRAFANA_DEDUPE_EMAILS}" | tr ',' ' '); do - ids="$(sqlite3 "$db" "select id from user where email = '${email}';")" - if [ -z "$ids" ]; then + user_id="$(curl -sf -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \ + "${grafana_url}/api/users/lookup?loginOrEmail=${email}" | jq -r '.id // empty')" + if [ -z "$user_id" ]; then echo "no grafana user found for ${email}" continue fi - echo "deleting grafana users with ids: ${ids}" - sqlite3 "$db" "delete from user_auth where user_id in (${ids});" - sqlite3 "$db" "delete from user where id in (${ids});" + echo "deleting grafana user ${user_id} (${email})" + curl -sf -X DELETE -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \ + "${grafana_url}/api/admin/users/${user_id}" done echo "done" env: + - name: GRAFANA_URL + value: http://grafana - name: GRAFANA_DEDUPE_EMAILS value: brad.stein@gmail.com,brad@bstein.dev - volumeMounts: - - name: grafana-storage - mountPath: /var/lib/grafana - volumes: - - name: grafana-storage - persistentVolumeClaim: - claimName: grafana From 08716c6be664ee2d04488bdfdc320e988ea4f4a7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:15:03 -0300 Subject: [PATCH 080/416] monitoring: use python dedupe job --- .../monitoring/grafana-user-dedupe-job.yaml | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index 833eb70..f3a1c26 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api + name: grafana-user-dedupe-api-v2 namespace: monitoring annotations: vault.hashicorp.com/agent-inject: "true" @@ -20,14 +20,13 @@ spec: restartPolicy: Never containers: - name: dedupe - image: alpine:3.20 + image: python:3.12-slim command: - /bin/sh - -c args: - | set -euo pipefail - apk add --no-cache curl jq . 
/vault/secrets/grafana-env.sh grafana_url="${GRAFANA_URL}" if [ -z "${grafana_url}" ]; then @@ -42,17 +41,41 @@ spec: echo "GRAFANA_DEDUPE_EMAILS is required" exit 1 fi - for email in $(echo "${GRAFANA_DEDUPE_EMAILS}" | tr ',' ' '); do - user_id="$(curl -sf -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \ - "${grafana_url}/api/users/lookup?loginOrEmail=${email}" | jq -r '.id // empty')" - if [ -z "$user_id" ]; then - echo "no grafana user found for ${email}" - continue - fi - echo "deleting grafana user ${user_id} (${email})" - curl -sf -X DELETE -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \ - "${grafana_url}/api/admin/users/${user_id}" - done + python - <<'PY' + import base64 + import json + import os + import urllib.parse + import urllib.request + + grafana_url = os.environ["GRAFANA_URL"].rstrip("/") + user = os.environ["GRAFANA_USER"] + password = os.environ["GRAFANA_PASSWORD"] + emails = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()] + + token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("utf-8") + headers = {"Authorization": f"Basic {token}"} + + def request(method: str, url: str): + req = urllib.request.Request(url, headers=headers, method=method) + with urllib.request.urlopen(req, timeout=10) as resp: + return resp.read() + + for email in emails: + lookup_url = f"{grafana_url}/api/users/lookup?loginOrEmail={urllib.parse.quote(email)}" + try: + payload = json.loads(request("GET", lookup_url)) + except Exception: + print(f"no grafana user found for {email}") + continue + user_id = payload.get("id") + if not user_id: + print(f"no grafana user found for {email}") + continue + print(f"deleting grafana user {user_id} ({email})") + delete_url = f"{grafana_url}/api/admin/users/{user_id}" + request("DELETE", delete_url) + PY echo "done" env: - name: GRAFANA_URL From 88de0f7cee4dcd0b2c7686bc1f27cfe6632134ba Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:16:26 -0300 Subject: [PATCH 081/416] monitoring: wire vault sa for dedupe job --- services/monitoring/grafana-user-dedupe-job.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index f3a1c26..631c25d 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v2 + name: grafana-user-dedupe-api-v3 namespace: monitoring annotations: vault.hashicorp.com/agent-inject: "true" @@ -17,6 +17,8 @@ spec: backoffLimit: 1 template: spec: + serviceAccountName: monitoring-vault-sync + automountServiceAccountToken: true restartPolicy: Never containers: - name: dedupe From 4e65f02fbaef76b8ad0a00cc2bbbf0dcbdc87dae Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:18:57 -0300 Subject: [PATCH 082/416] monitoring: prepopulate vault for dedupe job --- .../monitoring/grafana-user-dedupe-job.yaml | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index 631c25d..3eb001b 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,20 +2,23 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v3 + name: grafana-user-dedupe-api-v4 namespace: monitoring - annotations: - vault.hashicorp.com/agent-inject: 
"true" - vault.hashicorp.com/role: "monitoring" - vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" - vault.hashicorp.com/agent-inject-template-grafana-env.sh: | - {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} - export GRAFANA_USER="{{ index .Data.data "admin-user" }}" - export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}" - {{ end }} spec: backoffLimit: 1 template: + metadata: + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" + vault.hashicorp.com/role: "monitoring" + vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" + vault.hashicorp.com/agent-inject-template-grafana-env.sh: | + {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} + export GRAFANA_USER="{{ index .Data.data "admin-user" }}" + export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} spec: serviceAccountName: monitoring-vault-sync automountServiceAccountToken: true From ae1fd5b6616761dcdbc2f437505c6d61559d5a9c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:25:53 -0300 Subject: [PATCH 083/416] monitoring: fix grafana user dedupe job --- services/monitoring/grafana-user-dedupe-job.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index 3eb001b..e56362b 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v4 + name: grafana-user-dedupe-api-v5 namespace: monitoring spec: backoffLimit: 1 @@ -10,7 +10,6 @@ spec: metadata: annotations: vault.hashicorp.com/agent-inject: "true" - vault.hashicorp.com/agent-pre-populate: "true" vault.hashicorp.com/agent-pre-populate-only: "true" vault.hashicorp.com/role: "monitoring" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" @@ -32,6 +31,16 @@ spec: args: - | set -euo pipefail + for _ in $(seq 1 30); do + if [ -f /vault/secrets/grafana-env.sh ]; then + break + fi + sleep 1 + done + if [ ! -f /vault/secrets/grafana-env.sh ]; then + echo "Vault secret not available" + exit 1 + fi . 
/vault/secrets/grafana-env.sh grafana_url="${GRAFANA_URL}" if [ -z "${grafana_url}" ]; then From 5ae6b4b00cba49572755b5ff2dd08ba94c8bc51f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:30:08 -0300 Subject: [PATCH 084/416] monitoring: harden grafana user dedupe --- .../monitoring/grafana-user-dedupe-job.yaml | 63 ++++++++++++++----- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index e56362b..1d1bd09 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v5 + name: grafana-user-dedupe-api-v6 namespace: monitoring spec: backoffLimit: 1 @@ -60,35 +60,66 @@ spec: import json import os import urllib.parse + import urllib.error import urllib.request grafana_url = os.environ["GRAFANA_URL"].rstrip("/") user = os.environ["GRAFANA_USER"] password = os.environ["GRAFANA_PASSWORD"] - emails = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()] + lookups = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()] token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("utf-8") headers = {"Authorization": f"Basic {token}"} def request(method: str, url: str): req = urllib.request.Request(url, headers=headers, method=method) - with urllib.request.urlopen(req, timeout=10) as resp: - return resp.read() - - for email in emails: - lookup_url = f"{grafana_url}/api/users/lookup?loginOrEmail={urllib.parse.quote(email)}" try: - payload = json.loads(request("GET", lookup_url)) - except Exception: - print(f"no grafana user found for {email}") + with urllib.request.urlopen(req, timeout=10) as resp: + return resp.status, resp.read() + except urllib.error.HTTPError as err: + body = err.read() + return err.code, body + + for _ in range(60): + status, _ = request("GET", f"{grafana_url}/api/health") + if status == 200: + break + else: + raise SystemExit("Grafana API did not become ready in time") + + for lookup in lookups: + search_url = f"{grafana_url}/api/users/search?query={urllib.parse.quote(lookup)}" + status, body = request("GET", search_url) + if status != 200: + print(f"search failed for {lookup}: status={status} body={body.decode('utf-8', errors='ignore')}") continue - user_id = payload.get("id") - if not user_id: - print(f"no grafana user found for {email}") + payload = json.loads(body) + users = payload.get("users", []) + matches = [ + user + for user in users + if user.get("email", "").lower() == lookup.lower() + or user.get("login", "").lower() == lookup.lower() + ] + if not matches: + print(f"no grafana user found for {lookup}") continue - print(f"deleting grafana user {user_id} ({email})") - delete_url = f"{grafana_url}/api/admin/users/{user_id}" - request("DELETE", delete_url) + for user in matches: + user_id = user.get("id") + if not user_id: + continue + print(f"deleting grafana user {user_id} ({user.get('email')})") + delete_url = f"{grafana_url}/api/admin/users/{user_id}" + del_status, del_body = request("DELETE", delete_url) + if del_status not in (200, 202, 204): + print( + "delete failed for", + user_id, + "status", + del_status, + "body", + del_body.decode("utf-8", errors="ignore"), + ) PY echo "done" env: From 2e407e1962b5738b3486d572ac28a3fede020992 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:31:54 -0300 Subject: [PATCH 
085/416] monitoring: reschedule grafana user dedupe --- .../monitoring/grafana-user-dedupe-job.yaml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index 1d1bd09..8ab1a66 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v6 + name: grafana-user-dedupe-api-v7 namespace: monitoring spec: backoffLimit: 1 @@ -22,6 +22,20 @@ spec: serviceAccountName: monitoring-vault-sync automountServiceAccountToken: true restartPolicy: Never + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] containers: - name: dedupe image: python:3.12-slim @@ -124,6 +138,6 @@ spec: echo "done" env: - name: GRAFANA_URL - value: http://grafana + value: http://grafana.monitoring.svc.cluster.local - name: GRAFANA_DEDUPE_EMAILS value: brad.stein@gmail.com,brad@bstein.dev From 8b35ab02922bb80b5386dfcfa76f3c7b8fdc3d32 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 13:37:36 -0300 Subject: [PATCH 086/416] monitoring: refresh jobs dashboards --- scripts/dashboards_render_atlas.py | 330 ++++-- services/maintenance/ariadne-deployment.yaml | 8 +- .../{atlas-testing.json => atlas-jobs.json} | 1034 +++++++++------- .../monitoring/dashboards/atlas-overview.json | 284 ++++- ...sting.yaml => grafana-dashboard-jobs.yaml} | 1040 ++++++++++------- .../grafana-dashboard-overview.yaml | 284 ++++- services/monitoring/helmrelease.yaml | 6 +- services/monitoring/kustomization.yaml | 2 +- 8 files changed, 1946 insertions(+), 1042 deletions(-) rename services/monitoring/dashboards/{atlas-testing.json => atlas-jobs.json} (84%) rename services/monitoring/{grafana-dashboard-testing.yaml => grafana-dashboard-jobs.yaml} (84%) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 6eaafb4..1235a0a 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -337,16 +337,39 @@ GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - 
ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' +ARIADNE_TEST_SUCCESS_RATE = ( + "100 * " + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[1h])) ' + "/ clamp_min(" + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[1h])), 1)' +) +ARIADNE_TEST_FAILURES_24H = ( + 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' +) +ONEOFF_JOB_OWNER = ( + 'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")' +) +ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})' +ONEOFF_JOB_POD_AGE_HOURS = ( + '((time() - kube_pod_start_time{pod!=""}) / 3600) ' + f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} ' + '* on(namespace,pod) group_left(phase) ' + 'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})' +) GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -798,6 +821,15 @@ def build_overview(): {"color": "red", "value": 3}, ], } + age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 24}, + {"color": "red", "value": 48}, + ], + } row1_stats = [ { @@ -1000,7 +1032,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 2, "w": 5, "x": 0, "y": 8}, + {"h": 3, "w": 5, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -1011,7 +1043,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 2, "w": 5, "x": 10, "y": 8}, + "gridPos": {"h": 3, "w": 5, "x": 10, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1057,7 +1089,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 2, "w": 5, "x": 5, "y": 8}, + {"h": 3, "w": 5, "x": 5, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1069,7 +1101,7 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 2, "w": 5, "x": 15, "y": 8}, + {"h": 3, "w": 5, "x": 15, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, @@ -1089,13 +1121,76 @@ def build_overview(): panel_id, title, expr, - {"h": 6, "w": 6, "x": 6 * idx, "y": 10}, + {"h": 5, "w": 6, "x": 6 * idx, "y": 11}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), ) ) + panels.append( + bargauge_panel( + 40, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 6, "w": 4, "x": 0, "y": 16}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=8, + ) + ) + panels.append( + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": PROM_DS, + "gridPos": {"h": 6, "w": 8, "x": 4, "y": 16}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, 
+ {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + timeseries_panel( + 42, + "Ariadne Test Success Rate", + ARIADNE_TEST_SUCCESS_RATE, + {"h": 6, "w": 8, "x": 12, "y": 16}, + unit="percent", + legend=None, + legend_display="list", + ) + ) + panels.append( + bargauge_panel( + 43, + "Tests with Failures (24h)", + ARIADNE_TEST_FAILURES_24H, + {"h": 6, "w": 4, "x": 20, "y": 16}, + unit="none", + instant=True, + legend="{{result}}", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 5}, + {"color": "red", "value": 10}, + ], + }, + ) + ) + cpu_scope = "$namespace_scope_cpu" gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" @@ -1105,7 +1200,7 @@ def build_overview(): 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), - {"h": 9, "w": 8, "x": 0, "y": 16}, + {"h": 9, "w": 8, "x": 0, "y": 22}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1115,7 +1210,7 @@ def build_overview(): 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), - {"h": 9, "w": 8, "x": 8, "y": 16}, + {"h": 9, "w": 8, "x": 8, "y": 22}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1125,7 +1220,7 @@ def build_overview(): 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), - {"h": 9, "w": 8, "x": 16, "y": 16}, + {"h": 9, "w": 8, "x": 16, "y": 22}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. 
Switching scope changes the denominator.", ) @@ -1137,7 +1232,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 32}, + {"h": 12, "w": 12, "x": 0, "y": 38}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1151,7 +1246,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 32}, + {"h": 12, "w": 12, "x": 12, "y": 38}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1166,7 +1261,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 44}, + {"h": 10, "w": 12, "x": 0, "y": 50}, unit="percent", legend="{{node}}", legend_display="table", @@ -1178,7 +1273,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 44}, + {"h": 10, "w": 12, "x": 12, "y": 50}, unit="percent", legend="{{node}}", legend_display="table", @@ -1191,7 +1286,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 54}, + {"h": 10, "w": 12, "x": 0, "y": 60}, ) ) panels.append( @@ -1199,7 +1294,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 54}, + {"h": 10, "w": 12, "x": 12, "y": 60}, unit="none", limit=12, decimals=0, @@ -1221,7 +1316,7 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 25}, + {"h": 7, "w": 8, "x": 0, "y": 31}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -1234,7 +1329,7 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 25}, + {"h": 7, "w": 8, "x": 8, "y": 31}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -1247,7 +1342,7 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 25}, + {"h": 7, "w": 8, "x": 16, "y": 31}, unit="Bps", legend="Internal traffic", legend_display="list", @@ -1261,7 +1356,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 64}, + {"h": 16, "w": 12, "x": 0, "y": 70}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1276,7 +1371,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(12, {root_usage_expr()})", - {"h": 16, "w": 12, "x": 12, "y": 64}, + {"h": 16, "w": 12, "x": 12, "y": 70}, unit="percent", thresholds=PERCENT_THRESHOLDS, links=link_to("atlas-storage"), @@ -2171,7 +2266,7 @@ def build_mail_dashboard(): } -def build_testing_dashboard(): +def build_jobs_dashboard(): panels = [] age_thresholds = { "mode": "absolute", @@ -2192,12 +2287,65 @@ def build_testing_dashboard(): ], } + task_error_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5}, + ], + } + + panels.append( + bargauge_panel( + 1, + "Ariadne Task Errors (24h)", + ARIADNE_TASK_ERRORS_24H, + {"h": 7, "w": 6, "x": 0, "y": 0}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + { + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": PROM_DS, + "gridPos": {"h": 7, "w": 12, 
"x": 6, "y": 0}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + bargauge_panel( + 3, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 7, "w": 6, "x": 18, "y": 0}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=12, + ) + ) panels.append( stat_panel( - 1, + 4, "Glue Jobs Stale (>36h)", GLUE_STALE_COUNT, - {"h": 4, "w": 6, "x": 0, "y": 0}, + {"h": 4, "w": 4, "x": 0, "y": 7}, unit="none", thresholds={ "mode": "absolute", @@ -2212,99 +2360,47 @@ def build_testing_dashboard(): ) panels.append( stat_panel( - 2, + 5, "Glue Jobs Missing Success", GLUE_MISSING_COUNT, - {"h": 4, "w": 4, "x": 4, "y": 0}, - unit="none", - ) - ) - panels.append( - stat_panel( - 3, - "Glue Jobs Suspended", - GLUE_SUSPENDED_COUNT, - {"h": 4, "w": 4, "x": 8, "y": 0}, - unit="none", - ) - ) - panels.append( - stat_panel( - 4, - "Ariadne Task Errors (1h)", - ARIADNE_TASK_ERRORS_1H_TOTAL, - {"h": 4, "w": 4, "x": 12, "y": 0}, - unit="none", - ) - ) - panels.append( - stat_panel( - 5, - "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H_TOTAL, - {"h": 4, "w": 4, "x": 16, "y": 0}, + {"h": 4, "w": 4, "x": 4, "y": 7}, unit="none", ) ) panels.append( stat_panel( 6, - "Ariadne Task Runs (1h)", - ARIADNE_TASK_RUNS_1H_TOTAL, - {"h": 4, "w": 4, "x": 20, "y": 0}, + "Glue Jobs Suspended", + GLUE_SUSPENDED_COUNT, + {"h": 4, "w": 4, "x": 8, "y": 7}, unit="none", ) ) panels.append( - timeseries_panel( + stat_panel( 7, - "Ariadne Task Runs vs Errors (1h)", - ARIADNE_TASK_RUNS_BY_STATUS_1H, - {"h": 6, "w": 24, "x": 0, "y": 4}, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H_TOTAL, + {"h": 4, "w": 4, "x": 12, "y": 7}, unit="none", - legend="{{status}}", - legend_display="table", - legend_placement="right", ) ) panels.append( - bargauge_panel( + stat_panel( 8, "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H, - {"h": 8, "w": 12, "x": 0, "y": 10}, + ARIADNE_TASK_ERRORS_24H_TOTAL, + {"h": 4, "w": 4, "x": 16, "y": 7}, unit="none", - instant=True, - legend="{{task}}", - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 1}, - {"color": "orange", "value": 3}, - {"color": "red", "value": 5}, - ], - }, ) ) panels.append( - bargauge_panel( + stat_panel( 9, - "Ariadne Task Success (24h)", - ARIADNE_TASK_SUCCESS_24H, - {"h": 8, "w": 12, "x": 12, "y": 10}, + "Ariadne Task Runs (1h)", + ARIADNE_TASK_RUNS_1H_TOTAL, + {"h": 4, "w": 4, "x": 20, "y": 7}, unit="none", - instant=True, - legend="{{task}}", - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "red", "value": None}, - {"color": "orange", "value": 1}, - {"color": "yellow", "value": 5}, - {"color": "green", "value": 10}, - ], - }, ) ) panels.append( @@ -2312,7 +2408,7 @@ def build_testing_dashboard(): 10, "Ariadne Schedule Last Error (hours ago)", ARIADNE_SCHEDULE_LAST_ERROR_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 18}, + {"h": 8, "w": 12, "x": 0, "y": 11}, unit="h", instant=True, legend="{{task}}", @@ -2324,7 +2420,7 @@ def build_testing_dashboard(): 11, "Ariadne Schedule Last Success (hours ago)", ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 18}, + {"h": 8, "w": 12, "x": 12, "y": 
11}, unit="h", instant=True, legend="{{task}}", @@ -2336,7 +2432,7 @@ def build_testing_dashboard(): 12, "Glue Jobs Last Success (hours ago)", GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 26}, + {"h": 8, "w": 12, "x": 0, "y": 19}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", @@ -2348,7 +2444,7 @@ def build_testing_dashboard(): 13, "Glue Jobs Last Schedule (hours ago)", GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 26}, + {"h": 8, "w": 12, "x": 12, "y": 19}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", @@ -2358,9 +2454,33 @@ def build_testing_dashboard(): panels.append( bargauge_panel( 14, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H, + {"h": 8, "w": 12, "x": 0, "y": 27}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 15, + "Ariadne Task Errors (30d)", + ARIADNE_TASK_ERRORS_30D, + {"h": 8, "w": 12, "x": 12, "y": 27}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 16, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS, - {"h": 6, "w": 8, "x": 0, "y": 34}, + {"h": 6, "w": 8, "x": 0, "y": 35}, unit="none", instant=True, legend="{{status}}", @@ -2368,10 +2488,10 @@ def build_testing_dashboard(): ) panels.append( stat_panel( - 15, + 17, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 6, "w": 4, "x": 8, "y": 34}, + {"h": 6, "w": 4, "x": 8, "y": 35}, unit="percent", decimals=1, instant=True, @@ -2380,10 +2500,10 @@ def build_testing_dashboard(): ) panels.append( table_panel( - 16, + 18, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 12, "x": 12, "y": 34}, + {"h": 6, "w": 12, "x": 12, "y": 35}, unit="none", transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, @@ -2391,8 +2511,8 @@ def build_testing_dashboard(): ) return { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, @@ -2400,7 +2520,7 @@ def build_testing_dashboard(): "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", - "tags": ["atlas", "testing"], + "tags": ["atlas", "jobs", "glue"], } @@ -2497,9 +2617,9 @@ DASHBOARDS = { "builder": build_mail_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml", }, - "atlas-testing": { - "builder": build_testing_dashboard, - "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml", + "atlas-jobs": { + "builder": build_jobs_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml", }, "atlas-gpu": { "builder": build_gpu_dashboard, diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 069f388..01e940c 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -270,7 +270,7 @@ spec: - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE value: "30 4 * * *" - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC - value: "*/15 * * * *" + value: "0 * * * *" - name: ARIADNE_SCHEDULE_WGER_USER_SYNC value: "0 5 * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN @@ -286,11 +286,11 @@ spec: - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER value: "30 4 * * 0" - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH - value: "*/15 * * * *" + value: "0 * * * *" - name: 
ARIADNE_SCHEDULE_VAULT_OIDC - value: "*/15 * * * *" + value: "0 * * * *" - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME - value: "*/1 * * * *" + value: "*/5 * * * *" - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE value: "*/30 * * * *" - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-jobs.json similarity index 84% rename from services/monitoring/dashboards/atlas-testing.json rename to services/monitoring/dashboards/atlas-jobs.json index 420abf2..76e21f0 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -1,416 +1,11 @@ { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "stat", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 0 - }, - "targets": [ - { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 3, - "type": "stat", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - 
"gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 0 - }, - "targets": [ - { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 4, - "type": "stat", - "title": "Ariadne Task Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 5, - "type": "stat", - "title": "Ariadne Task Errors (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 6, - "type": "stat", - "title": "Ariadne Task Runs (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, - "type": "timeseries", - "title": "Ariadne Task Runs vs Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": 
"atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", - "refId": "A", - "legendFormat": "{{status}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 8, "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { @@ -418,10 +13,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 6, "x": 0, - "y": 10 + "y": 0 }, "targets": [ { @@ -484,50 +79,92 @@ ] }, { - "id": 9, - "type": "bargauge", - "title": "Ariadne Task Success (24h)", + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 10 + "x": 6, + "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", + "expr": "sum(increase(ariadne_task_runs_total[1h]))", "refId": "A", - "legendFormat": "{{task}}", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", "instant": true } ], "fieldConfig": { "defaults": { - "unit": "none", + "unit": "h", "min": 0, "max": null, "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, - { - "color": "orange", - "value": 1 - }, { "color": "yellow", - "value": 5 + "value": 6 }, { - "color": "green", - "value": 10 + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 } ] } @@ -554,9 +191,383 @@ ], "order": "desc" } + }, + { + "id": "limit", + "options": { + "limit": 12 + } } ] }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) 
kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 10, "type": "bargauge", @@ -569,7 +580,7 @@ "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 11 }, "targets": [ { @@ -643,7 +654,7 @@ "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 11 }, "targets": [ { @@ -717,7 +728,7 @@ "h": 8, "w": 12, "x": 0, - "y": 26 + "y": 19 }, "targets": [ { @@ -791,7 +802,7 @@ "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 19 }, "targets": [ { @@ -856,6 +867,154 @@ { "id": 14, "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "refId": "A", + 
"legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", "title": "Ariadne Access Requests", "datasource": { "type": "prometheus", @@ -865,7 +1024,7 @@ "h": 6, "w": 8, "x": 0, - "y": 34 + "y": 35 }, "targets": [ { @@ -928,7 +1087,7 @@ ] }, { - "id": 15, + "id": 17, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -939,7 +1098,7 @@ "h": 6, "w": 4, "x": 8, - "y": 34 + "y": 35 }, "targets": [ { @@ -991,7 +1150,7 @@ } }, { - "id": 16, + "id": 18, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -1002,7 +1161,7 @@ "h": 6, "w": 12, "x": 12, - "y": 34 + "y": 35 }, "targets": [ { @@ -1052,6 +1211,7 @@ "style": "dark", "tags": [ "atlas", - "testing" + "jobs", + "glue" ] } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index c5f30d1..c3ff327 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -795,7 +795,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 0, "y": 8 @@ -862,7 +862,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 10, "y": 8 @@ -967,7 +967,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 5, "y": 8 @@ -1043,7 +1043,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 15, "y": 8 @@ -1119,10 +1119,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1194,10 +1194,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1269,10 +1269,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1336,10 +1336,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1394,6 +1394,238 @@ } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": 
"orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 4, + "y": 16 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 12, + "y": 16 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 16 + }, + "targets": [ + { + "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 11, "type": "piechart", @@ -1406,7 +1638,7 @@ "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 22 }, "targets": [ { @@ -1475,7 +1707,7 @@ "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 22 }, "targets": [ { @@ -1544,7 +1776,7 @@ "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 22 }, "targets": [ { @@ -1613,7 +1845,7 @@ "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 38 }, "targets": [ { @@ -1660,7 +1892,7 @@ "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 38 }, "targets": [ { @@ -1707,7 +1939,7 @@ "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 50 }, "targets": [ { @@ -1744,7 +1976,7 @@ "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 50 }, "targets": [ { @@ -1781,7 +2013,7 @@ "h": 10, "w": 12, "x": 0, - 
"y": 54 + "y": 60 }, "targets": [ { @@ -1832,7 +2064,7 @@ "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 60 }, "targets": [ { @@ -1913,7 +2145,7 @@ "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 31 }, "targets": [ { @@ -1957,7 +2189,7 @@ "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 31 }, "targets": [ { @@ -2001,7 +2233,7 @@ "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 31 }, "targets": [ { @@ -2045,7 +2277,7 @@ "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 70 }, "targets": [ { @@ -2093,7 +2325,7 @@ "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 70 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-jobs.yaml similarity index 84% rename from services/monitoring/grafana-dashboard-testing.yaml rename to services/monitoring/grafana-dashboard-jobs.yaml index 52b2836..19e0d4e 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -1,425 +1,20 @@ -# services/monitoring/grafana-dashboard-testing.yaml +# services/monitoring/grafana-dashboard-jobs.yaml apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboard-testing + name: grafana-dashboard-jobs labels: grafana_dashboard: "1" data: - atlas-testing.json: | + atlas-jobs.json: | { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "stat", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 0 - }, - "targets": [ - { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - 
"mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 3, - "type": "stat", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 0 - }, - "targets": [ - { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 4, - "type": "stat", - "title": "Ariadne Task Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 5, - "type": "stat", - "title": "Ariadne Task Errors (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 6, - "type": "stat", - "title": "Ariadne Task Runs (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": 
"absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, - "type": "timeseries", - "title": "Ariadne Task Runs vs Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", - "refId": "A", - "legendFormat": "{{status}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 8, "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { @@ -427,10 +22,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 6, "x": 0, - "y": 10 + "y": 0 }, "targets": [ { @@ -493,50 +88,92 @@ data: ] }, { - "id": 9, - "type": "bargauge", - "title": "Ariadne Task Success (24h)", + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 10 + "x": 6, + "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", + "expr": "sum(increase(ariadne_task_runs_total[1h]))", "refId": "A", - "legendFormat": "{{task}}", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", "instant": true } ], "fieldConfig": { "defaults": { - "unit": "none", + "unit": "h", "min": 0, "max": null, "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, - { - "color": "orange", - "value": 1 - }, { "color": "yellow", - "value": 5 + "value": 6 }, { - "color": "green", - "value": 10 + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 } ] } @@ -563,9 +200,383 @@ data: ], "order": "desc" } + }, + { + "id": "limit", + "options": { + "limit": 12 + } } ] }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, 
+ "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": 
"sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 10, "type": "bargauge", @@ -578,7 +589,7 @@ data: "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 11 }, "targets": [ { @@ -652,7 +663,7 @@ data: "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 11 }, "targets": [ { @@ -726,7 +737,7 @@ data: "h": 8, "w": 12, "x": 0, - "y": 26 + "y": 19 }, "targets": [ { @@ -800,7 +811,7 @@ data: "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 19 }, "targets": [ { @@ -865,6 +876,154 @@ data: { "id": 14, "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": 
"gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", "title": "Ariadne Access Requests", "datasource": { "type": "prometheus", @@ -874,7 +1033,7 @@ data: "h": 6, "w": 8, "x": 0, - "y": 34 + "y": 35 }, "targets": [ { @@ -937,7 +1096,7 @@ data: ] }, { - "id": 15, + "id": 17, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -948,7 +1107,7 @@ data: "h": 6, "w": 4, "x": 8, - "y": 34 + "y": 35 }, "targets": [ { @@ -1000,7 +1159,7 @@ data: } }, { - "id": 16, + "id": 18, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -1011,7 +1170,7 @@ data: "h": 6, "w": 12, "x": 12, - "y": 34 + "y": 35 }, "targets": [ { @@ -1061,6 +1220,7 @@ data: "style": "dark", "tags": [ "atlas", - "testing" + "jobs", + "glue" ] } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 8ad7523..45969cc 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -804,7 +804,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 0, "y": 8 @@ -871,7 +871,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 10, "y": 8 @@ -976,7 +976,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 5, "y": 8 @@ -1052,7 +1052,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 15, "y": 8 @@ -1128,10 +1128,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1203,10 +1203,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1278,10 +1278,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1345,10 +1345,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1403,6 +1403,238 @@ data: } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "((time() - 
kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 4, + "y": 16 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 12, + "y": 16 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 16 + }, + "targets": [ + { + "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + 
} + } + ] + }, { "id": 11, "type": "piechart", @@ -1415,7 +1647,7 @@ data: "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 22 }, "targets": [ { @@ -1484,7 +1716,7 @@ data: "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 22 }, "targets": [ { @@ -1553,7 +1785,7 @@ data: "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 22 }, "targets": [ { @@ -1622,7 +1854,7 @@ data: "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 38 }, "targets": [ { @@ -1669,7 +1901,7 @@ data: "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 38 }, "targets": [ { @@ -1716,7 +1948,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 50 }, "targets": [ { @@ -1753,7 +1985,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 50 }, "targets": [ { @@ -1790,7 +2022,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 60 }, "targets": [ { @@ -1841,7 +2073,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 60 }, "targets": [ { @@ -1922,7 +2154,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 31 }, "targets": [ { @@ -1966,7 +2198,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 31 }, "targets": [ { @@ -2010,7 +2242,7 @@ data: "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 31 }, "targets": [ { @@ -2054,7 +2286,7 @@ data: "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 70 }, "targets": [ { @@ -2102,7 +2334,7 @@ data: "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 70 }, "targets": [ { diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 02bc482..ac24f8a 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -471,14 +471,14 @@ spec: editable: true options: path: /var/lib/grafana/dashboards/mail - - name: testing + - name: jobs orgId: 1 folder: Atlas Internal type: file disableDeletion: false editable: true options: - path: /var/lib/grafana/dashboards/testing + path: /var/lib/grafana/dashboards/jobs dashboardsConfigMaps: overview: grafana-dashboard-overview overview-public: grafana-dashboard-overview @@ -488,7 +488,7 @@ spec: gpu: grafana-dashboard-gpu network: grafana-dashboard-network mail: grafana-dashboard-mail - testing: grafana-dashboard-testing + jobs: grafana-dashboard-jobs extraConfigmapMounts: - name: grafana-folders mountPath: /etc/grafana/provisioning/folders diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 86ab826..5953039 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -14,7 +14,7 @@ resources: - grafana-dashboard-network.yaml - grafana-dashboard-gpu.yaml - grafana-dashboard-mail.yaml - - grafana-dashboard-testing.yaml + - grafana-dashboard-jobs.yaml - dcgm-exporter.yaml - jetson-tegrastats-exporter.yaml - postmark-exporter-service.yaml From 2138b93242196d19fb3b3ca45f7d93b2b2a674eb Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 16:40:09 +0000 Subject: [PATCH 087/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 05f3be2..6cb2acd 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-16 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-17 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 
b0996e9a4fd67e0d3c93ac144738434d6d922ae1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 14:30:55 -0300 Subject: [PATCH 088/416] monitoring: refine jobs/overview panels --- scripts/dashboards_render_atlas.py | 162 ++++++++++++------ .../monitoring/dashboards/atlas-jobs.json | 119 ++++++++----- .../monitoring/dashboards/atlas-nodes.json | 4 +- .../monitoring/dashboards/atlas-overview.json | 135 +++++++++------ .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-jobs.yaml | 119 ++++++++----- .../monitoring/grafana-dashboard-nodes.yaml | 4 +- .../grafana-dashboard-overview.yaml | 135 +++++++++------ .../monitoring/grafana-dashboard-pods.yaml | 2 +- 9 files changed, 446 insertions(+), 236 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 1235a0a..3d581c7 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -70,6 +70,7 @@ WORKER_NODES = [ "titan-13", "titan-14", "titan-15", + "titan-16", "titan-17", "titan-18", "titan-19", @@ -333,9 +334,10 @@ GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})" GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)" GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})" -GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" -GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" -GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" +GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)" +GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)" +GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)" +ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))' ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))' @@ -344,10 +346,19 @@ ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_to ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' -ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))' -ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))' +ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))' +ARIADNE_TASK_WARNINGS_SERIES = ( + 'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)' +) ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" +ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = ( + "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600" +) +ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = ( + "(time() - 
max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" +) ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' @@ -370,6 +381,8 @@ ONEOFF_JOB_POD_AGE_HOURS = ( '* on(namespace,pod) group_left(phase) ' 'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})' ) +GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600" +GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -1032,7 +1045,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 3, "w": 5, "x": 0, "y": 8}, + {"h": 3, "w": 6, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -1043,7 +1056,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 3, "w": 5, "x": 10, "y": 8}, + "gridPos": {"h": 3, "w": 6, "x": 12, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1089,7 +1102,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 3, "w": 5, "x": 5, "y": 8}, + {"h": 3, "w": 6, "x": 6, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1101,7 +1114,7 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 3, "w": 5, "x": 15, "y": 8}, + {"h": 3, "w": 6, "x": 18, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, @@ -1121,7 +1134,7 @@ def build_overview(): panel_id, title, expr, - {"h": 5, "w": 6, "x": 6 * idx, "y": 11}, + {"h": 3, "w": 6, "x": 6 * idx, "y": 11}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), @@ -1133,26 +1146,44 @@ def build_overview(): 40, "One-off Job Pods (age hours)", ONEOFF_JOB_POD_AGE_HOURS, - {"h": 6, "w": 4, "x": 0, "y": 16}, + {"h": 6, "w": 6, "x": 0, "y": 14}, unit="h", instant=True, legend="{{namespace}}/{{pod}}", thresholds=age_thresholds, limit=8, + decimals=2, ) ) panels.append( { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": PROM_DS, - "gridPos": {"h": 6, "w": 8, "x": 4, "y": 16}, + "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14}, "targets": [ - {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, ], - "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Warnings"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + 
}, + ], + }, "options": { "legend": {"displayMode": "table", "placement": "right"}, "tooltip": {"mode": "multi"}, @@ -1164,7 +1195,7 @@ def build_overview(): 42, "Ariadne Test Success Rate", ARIADNE_TEST_SUCCESS_RATE, - {"h": 6, "w": 8, "x": 12, "y": 16}, + {"h": 6, "w": 6, "x": 12, "y": 14}, unit="percent", legend=None, legend_display="list", @@ -1175,7 +1206,7 @@ def build_overview(): 43, "Tests with Failures (24h)", ARIADNE_TEST_FAILURES_24H, - {"h": 6, "w": 4, "x": 20, "y": 16}, + {"h": 6, "w": 6, "x": 18, "y": 14}, unit="none", instant=True, legend="{{result}}", @@ -1200,7 +1231,7 @@ def build_overview(): 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), - {"h": 9, "w": 8, "x": 0, "y": 22}, + {"h": 9, "w": 8, "x": 0, "y": 20}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1210,7 +1241,7 @@ def build_overview(): 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), - {"h": 9, "w": 8, "x": 8, "y": 22}, + {"h": 9, "w": 8, "x": 8, "y": 20}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1220,7 +1251,7 @@ def build_overview(): 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), - {"h": 9, "w": 8, "x": 16, "y": 22}, + {"h": 9, "w": 8, "x": 16, "y": 20}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1232,7 +1263,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 38}, + {"h": 12, "w": 12, "x": 0, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1246,7 +1277,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 38}, + {"h": 12, "w": 12, "x": 12, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1261,7 +1292,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 50}, + {"h": 10, "w": 12, "x": 0, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1273,7 +1304,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 50}, + {"h": 10, "w": 12, "x": 12, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1286,7 +1317,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 60}, + {"h": 10, "w": 12, "x": 0, "y": 58}, ) ) panels.append( @@ -1294,7 +1325,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 60}, + {"h": 10, "w": 12, "x": 12, "y": 58}, unit="none", limit=12, decimals=0, @@ -1316,7 +1347,7 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 31}, + {"h": 7, "w": 8, "x": 0, "y": 29}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -1329,7 +1360,7 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 31}, + {"h": 7, "w": 8, "x": 8, "y": 29}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -1342,7 
+1373,7 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 31}, + {"h": 7, "w": 8, "x": 16, "y": 29}, unit="Bps", legend="Internal traffic", legend_display="list", @@ -1356,7 +1387,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 70}, + {"h": 16, "w": 12, "x": 0, "y": 68}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1371,7 +1402,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(12, {root_usage_expr()})", - {"h": 16, "w": 12, "x": 12, "y": 70}, + {"h": 16, "w": 12, "x": 12, "y": 68}, unit="percent", thresholds=PERCENT_THRESHOLDS, links=link_to("atlas-storage"), @@ -2300,9 +2331,9 @@ def build_jobs_dashboard(): panels.append( bargauge_panel( 1, - "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H, - {"h": 7, "w": 6, "x": 0, "y": 0}, + "Ariadne Task Errors (range)", + ARIADNE_TASK_ERRORS_RANGE, + {"h": 7, "w": 8, "x": 0, "y": 0}, unit="none", instant=True, legend="{{task}}", @@ -2313,14 +2344,31 @@ def build_jobs_dashboard(): { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": PROM_DS, - "gridPos": {"h": 7, "w": 12, "x": 6, "y": 0}, + "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0}, "targets": [ - {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, ], - "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Warnings"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, "options": { "legend": {"displayMode": "table", "placement": "right"}, "tooltip": {"mode": "multi"}, @@ -2332,12 +2380,13 @@ def build_jobs_dashboard(): 3, "One-off Job Pods (age hours)", ONEOFF_JOB_POD_AGE_HOURS, - {"h": 7, "w": 6, "x": 18, "y": 0}, + {"h": 7, "w": 8, "x": 16, "y": 0}, unit="h", instant=True, legend="{{namespace}}/{{pod}}", thresholds=age_thresholds, limit=12, + decimals=2, ) ) panels.append( @@ -2407,48 +2456,53 @@ def build_jobs_dashboard(): bargauge_panel( 10, "Ariadne Schedule Last Error (hours ago)", - ARIADNE_SCHEDULE_LAST_ERROR_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 11}, + ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 17}, unit="h", instant=True, legend="{{task}}", thresholds=recent_error_thresholds, + sort_order="asc", + decimals=2, ) ) panels.append( bargauge_panel( 11, "Ariadne Schedule Last Success (hours ago)", - ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 11}, + ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 17}, unit="h", instant=True, legend="{{task}}", thresholds=age_thresholds, + decimals=2, ) ) panels.append( bargauge_panel( 12, "Glue Jobs Last Success (hours ago)", - GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 19}, + GLUE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, 
"w": 12, "x": 0, "y": 23}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", thresholds=age_thresholds, + decimals=2, ) ) panels.append( bargauge_panel( 13, "Glue Jobs Last Schedule (hours ago)", - GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 19}, + GLUE_LAST_SCHEDULE_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 23}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", thresholds=age_thresholds, + decimals=2, ) ) panels.append( @@ -2456,7 +2510,7 @@ def build_jobs_dashboard(): 14, "Ariadne Task Errors (1h)", ARIADNE_TASK_ERRORS_1H, - {"h": 8, "w": 12, "x": 0, "y": 27}, + {"h": 6, "w": 12, "x": 0, "y": 29}, unit="none", instant=True, legend="{{task}}", @@ -2468,7 +2522,7 @@ def build_jobs_dashboard(): 15, "Ariadne Task Errors (30d)", ARIADNE_TASK_ERRORS_30D, - {"h": 8, "w": 12, "x": 12, "y": 27}, + {"h": 6, "w": 12, "x": 12, "y": 29}, unit="none", instant=True, legend="{{task}}", @@ -2480,7 +2534,7 @@ def build_jobs_dashboard(): 16, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS, - {"h": 6, "w": 8, "x": 0, "y": 35}, + {"h": 6, "w": 8, "x": 0, "y": 11}, unit="none", instant=True, legend="{{status}}", @@ -2491,7 +2545,7 @@ def build_jobs_dashboard(): 17, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 6, "w": 4, "x": 8, "y": 35}, + {"h": 6, "w": 4, "x": 8, "y": 11}, unit="percent", decimals=1, instant=True, @@ -2503,7 +2557,7 @@ def build_jobs_dashboard(): 18, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 12, "x": 12, "y": 35}, + {"h": 6, "w": 12, "x": 12, "y": 11}, unit="none", transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 76e21f0..c70e9c0 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -7,20 +7,20 @@ { "id": 1, "type": "bargauge", - "title": "Ariadne Task Errors (24h)", + "title": "Ariadne Task Errors (range)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -81,26 +81,31 @@ { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 12, - "x": 6, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -108,7 +113,38 @@ "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { 
+ "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -130,8 +166,8 @@ }, "gridPos": { "h": 7, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 0 }, "targets": [ @@ -167,7 +203,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -216,7 +253,7 @@ }, "targets": [ { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", "refId": "A" } ], @@ -284,7 +321,7 @@ }, "targets": [ { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", "refId": "A" } ], @@ -344,7 +381,7 @@ }, "targets": [ { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", "refId": "A" } ], @@ -577,14 +614,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -615,7 +652,8 @@ "value": 24 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -637,7 +675,7 @@ "fields": [ "Value" ], - "order": "desc" + "order": "asc" } } ] @@ -651,14 +689,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", 
"refId": "A", "legendFormat": "{{task}}", "instant": true @@ -689,7 +727,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -725,14 +764,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -763,7 +802,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -799,14 +839,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -837,7 +877,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -873,10 +914,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 27 + "y": 29 }, "targets": [ { @@ -947,10 +988,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 27 + "y": 29 }, "targets": [ { @@ -1024,7 +1065,7 @@ "h": 6, "w": 8, "x": 0, - "y": 35 + "y": 11 }, "targets": [ { @@ -1098,7 +1139,7 @@ "h": 6, "w": 4, "x": 8, - "y": 35 + "y": 11 }, "targets": [ { @@ -1161,7 +1202,7 @@ "h": 6, "w": 12, "x": 12, - "y": 35 + "y": 11 }, "targets": [ { diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 2d60042..ea59579 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -46,7 +46,7 @@ "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index c3ff327..5acc2a3 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -449,14 +449,14 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": 
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -466,15 +466,15 @@ }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -796,7 +796,7 @@ }, "gridPos": { "h": 3, - "w": 5, + "w": 6, "x": 0, "y": 8 }, @@ -863,8 +863,8 @@ }, "gridPos": { "h": 3, - "w": 5, - "x": 10, + "w": 6, + "x": 12, "y": 8 }, "targets": [ @@ -968,8 +968,8 @@ }, "gridPos": { "h": 3, - "w": 5, - "x": 5, + "w": 6, + "x": 6, "y": 8 }, "targets": [ @@ -1044,8 +1044,8 @@ }, "gridPos": { "h": 3, - "w": 5, - "x": 15, + "w": 6, + "x": 18, "y": 8 }, "targets": [ @@ -1119,7 +1119,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 0, "y": 11 @@ -1194,7 +1194,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 6, "y": 11 @@ -1269,7 +1269,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 12, "y": 11 @@ -1336,7 +1336,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 18, "y": 11 @@ -1404,9 +1404,9 @@ }, "gridPos": { "h": 6, - "w": 4, + "w": 6, "x": 0, - "y": 16 + "y": 14 }, "targets": [ { @@ -1441,7 +1441,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -1477,26 +1478,31 @@ { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 8, - "x": 4, - "y": 16 + "w": 6, + "x": 6, + "y": 14 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -1504,7 +1510,38 @@ "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -1526,9 +1563,9 @@ }, "gridPos": { "h": 6, - "w": 8, + "w": 6, "x": 12, - "y": 16 + "y": 14 }, "targets": [ { @@ -1562,9 +1599,9 @@ }, "gridPos": { "h": 6, - "w": 4, - "x": 20, - "y": 16 + "w": 6, + "x": 18, + "y": 14 }, "targets": [ { @@ -1638,7 +1675,7 @@ "h": 9, "w": 8, "x": 0, - "y": 22 + "y": 20 }, "targets": [ { @@ -1707,7 +1744,7 @@ "h": 9, "w": 8, "x": 8, - "y": 22 + "y": 20 }, "targets": [ { @@ -1776,7 +1813,7 @@ "h": 9, "w": 8, "x": 16, - "y": 22 + "y": 20 }, "targets": [ { @@ -1845,11 +1882,11 @@ "h": 12, "w": 12, "x": 0, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by 
(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1892,11 +1929,11 @@ "h": 12, "w": 12, "x": 12, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1939,7 +1976,7 @@ "h": 10, "w": 12, "x": 0, - "y": 50 + "y": 48 }, "targets": [ { @@ -1976,7 +2013,7 @@ "h": 10, "w": 12, "x": 12, - "y": 50 + "y": 48 }, "targets": [ { @@ -2013,7 +2050,7 @@ "h": 10, "w": 12, "x": 0, - "y": 60 + "y": 58 }, "targets": [ { @@ -2064,7 +2101,7 @@ "h": 10, "w": 12, "x": 12, - "y": 60 + "y": 58 }, "targets": [ { @@ -2145,7 +2182,7 @@ "h": 7, "w": 8, "x": 0, - "y": 31 + "y": 29 }, "targets": [ { @@ -2189,7 +2226,7 @@ "h": 7, "w": 8, "x": 8, - "y": 31 + "y": 29 }, "targets": [ { @@ -2233,7 +2270,7 @@ "h": 7, "w": 8, "x": 16, - "y": 31 + "y": 29 }, "targets": [ { @@ -2277,7 +2314,7 @@ "h": 16, "w": 12, "x": 0, - "y": 70 + "y": 68 }, "targets": [ { @@ -2325,7 +2362,7 @@ "h": 16, "w": 12, "x": 12, - "y": 70 + "y": 68 }, "targets": [ { diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index adab84b..e36aa1f 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -520,7 +520,7 @@ }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) 
group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by 
(node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) 
(kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table" diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 19e0d4e..36c1252 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -16,20 +16,20 @@ data: { "id": 1, "type": "bargauge", - "title": "Ariadne Task Errors (24h)", + "title": "Ariadne Task Errors (range)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -90,26 +90,31 @@ data: { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 12, - "x": 6, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -117,7 +122,38 @@ data: "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -139,8 +175,8 @@ data: }, "gridPos": { "h": 7, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 0 }, "targets": [ @@ -176,7 +212,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -225,7 +262,7 @@ data: }, "targets": [ { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) 
kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", "refId": "A" } ], @@ -293,7 +330,7 @@ data: }, "targets": [ { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", "refId": "A" } ], @@ -353,7 +390,7 @@ data: }, "targets": [ { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", "refId": "A" } ], @@ -586,14 +623,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -624,7 +661,8 @@ data: "value": 24 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -646,7 +684,7 @@ data: "fields": [ "Value" ], - "order": "desc" + "order": "asc" } } ] @@ -660,14 +698,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -698,7 +736,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -734,14 +773,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -772,7 +811,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -808,14 +848,14 @@ 
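The 'or on() vector(0)' suffix appended to the stale, missing, and suspended glue counts above works around a PromQL quirk: sum() or count() over an empty selection returns an empty instant vector rather than 0, which a stat panel renders as "No data". A small helper in the spirit of the change; the function name is my own, not the render script's:

    def zero_if_empty(expr: str) -> str:
        """Fall back to a literal 0 sample when expr matches no series."""
        # count()/sum() over nothing is an empty vector in PromQL, not 0;
        # 'or on() vector(0)' substitutes a constant so stat panels render.
        return f"({expr}) or on() vector(0)"


    if __name__ == "__main__":
        guarded = zero_if_empty(
            'sum((kube_cronjob_spec_suspend and on(namespace,cronjob) '
            'kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}) == 1)'
        )
        assert guarded.endswith("or on() vector(0)")
        print(guarded)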
data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -846,7 +886,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -882,10 +923,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 27 + "y": 29 }, "targets": [ { @@ -956,10 +997,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 27 + "y": 29 }, "targets": [ { @@ -1033,7 +1074,7 @@ data: "h": 6, "w": 8, "x": 0, - "y": 35 + "y": 11 }, "targets": [ { @@ -1107,7 +1148,7 @@ data: "h": 6, "w": 4, "x": 8, - "y": 35 + "y": 11 }, "targets": [ { @@ -1170,7 +1211,7 @@ data: "h": 6, "w": 12, "x": 12, - "y": 35 + "y": 11 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index f0f1982..98123b9 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -55,7 +55,7 @@ data: "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 45969cc..55196e8 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -458,14 +458,14 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -475,15 +475,15 @@ data: }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -805,7 +805,7 @@ data: }, "gridPos": { "h": 3, - "w": 5, + "w": 6, "x": 0, "y": 8 }, @@ -872,8 +872,8 @@ data: }, "gridPos": { "h": 3, - "w": 5, - "x": 10, + "w": 6, + "x": 12, "y": 8 }, "targets": [ @@ 
-977,8 +977,8 @@ data: }, "gridPos": { "h": 3, - "w": 5, - "x": 5, + "w": 6, + "x": 6, "y": 8 }, "targets": [ @@ -1053,8 +1053,8 @@ data: }, "gridPos": { "h": 3, - "w": 5, - "x": 15, + "w": 6, + "x": 18, "y": 8 }, "targets": [ @@ -1128,7 +1128,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 0, "y": 11 @@ -1203,7 +1203,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 6, "y": 11 @@ -1278,7 +1278,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 12, "y": 11 @@ -1345,7 +1345,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 18, "y": 11 @@ -1413,9 +1413,9 @@ data: }, "gridPos": { "h": 6, - "w": 4, + "w": 6, "x": 0, - "y": 16 + "y": 14 }, "targets": [ { @@ -1450,7 +1450,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -1486,26 +1487,31 @@ data: { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 8, - "x": 4, - "y": 16 + "w": 6, + "x": 6, + "y": 14 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -1513,7 +1519,38 @@ data: "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -1535,9 +1572,9 @@ data: }, "gridPos": { "h": 6, - "w": 8, + "w": 6, "x": 12, - "y": 16 + "y": 14 }, "targets": [ { @@ -1571,9 +1608,9 @@ data: }, "gridPos": { "h": 6, - "w": 4, - "x": 20, - "y": 16 + "w": 6, + "x": 18, + "y": 14 }, "targets": [ { @@ -1647,7 +1684,7 @@ data: "h": 9, "w": 8, "x": 0, - "y": 22 + "y": 20 }, "targets": [ { @@ -1716,7 +1753,7 @@ data: "h": 9, "w": 8, "x": 8, - "y": 22 + "y": 20 }, "targets": [ { @@ -1785,7 +1822,7 @@ data: "h": 9, "w": 8, "x": 16, - "y": 22 + "y": 20 }, "targets": [ { @@ -1854,11 +1891,11 @@ data: "h": 12, "w": 12, "x": 0, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", 
\"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1901,11 +1938,11 @@ data: "h": 12, "w": 12, "x": 12, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1948,7 +1985,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 50 + "y": 48 }, "targets": [ { @@ -1985,7 +2022,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 50 + "y": 48 }, "targets": [ { @@ -2022,7 +2059,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 60 + "y": 58 }, "targets": [ { @@ -2073,7 +2110,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 60 + "y": 58 }, "targets": [ { @@ -2154,7 +2191,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 31 + "y": 29 }, "targets": [ { @@ -2198,7 +2235,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 31 + "y": 29 }, "targets": [ { @@ -2242,7 +2279,7 @@ data: "h": 7, "w": 8, "x": 16, - "y": 31 + "y": 29 }, "targets": [ { @@ -2286,7 +2323,7 @@ data: "h": 16, "w": 12, "x": 0, - "y": 70 + "y": 68 }, "targets": [ { @@ -2334,7 +2371,7 @@ data: "h": 16, "w": 12, "x": 12, - "y": 70 + "y": 68 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index f537d4c..6273023 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -529,7 +529,7 @@ data: }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) 
(kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) 
(kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) 
(kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table" From db4c3b7c5105402e4a35642f39b71b3b62d1fef3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 15:01:02 -0300 Subject: [PATCH 089/416] monitoring: tighten jobs/overview ordering --- scripts/dashboards_render_atlas.py | 18 +++++++++- .../monitoring/dashboards/atlas-jobs.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 36 +++++++++++++++++-- .../monitoring/grafana-dashboard-jobs.yaml | 2 +- .../grafana-dashboard-overview.yaml | 36 +++++++++++++++++-- 5 files changed, 87 insertions(+), 7 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 3d581c7..c3f3655 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -560,6 +560,7 @@ def timeseries_panel( grid, *, unit="none", + max_value=None, legend=None, legend_display="table", legend_placement="bottom", @@ -584,6 +585,8 @@ def timeseries_panel( "tooltip": {"mode": "multi"}, }, } + if max_value is not None: + panel["fieldConfig"]["defaults"]["max"] = max_value if legend: panel["targets"][0]["legendFormat"] = legend if legend_calcs: @@ -742,6 +745,7 @@ def bargauge_panel( thresholds=None, decimals=None, instant=False, + overrides=None, ): """Return a bar gauge panel with label-aware reduction.""" panel = { @@ -786,6 +790,8 @@ def bargauge_panel( }, }, } + if overrides: + panel["fieldConfig"]["overrides"].extend(overrides) if decimals is not None: panel["fieldConfig"]["defaults"]["decimals"] = decimals if links: @@ -1197,6 +1203,7 @@ def build_overview(): ARIADNE_TEST_SUCCESS_RATE, {"h": 6, "w": 6, "x": 12, "y": 14}, unit="percent", + max_value=100, legend=None, legend_display="list", ) @@ -1210,6 +1217,16 @@ def build_overview(): unit="none", instant=True, legend="{{result}}", + overrides=[ + { + "matcher": {"id": "byName", "options": "error"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}], + }, + { + "matcher": {"id": "byName", "options": "failed"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}], + }, + ], thresholds={ "mode": "absolute", "steps": [ @@ -2462,7 +2479,6 @@ def build_jobs_dashboard(): instant=True, legend="{{task}}", thresholds=recent_error_thresholds, - sort_order="asc", decimals=2, ) ) diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index c70e9c0..810b3b3 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -675,7 +675,7 @@ "fields": [ "Value" ], - "order": "asc" + "order": "desc" } } ] diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 5acc2a3..3feb531 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1575,7 +1575,8 @@ ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "max": 100 }, "overrides": [] }, @@ -1638,7 +1639,38 @@ ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + 
"fixedColor": "red" + } + } + ] + } + ] }, "options": { "displayMode": "gradient", diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 36c1252..279d959 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -684,7 +684,7 @@ data: "fields": [ "Value" ], - "order": "asc" + "order": "desc" } } ] diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 55196e8..66b6da0 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1584,7 +1584,8 @@ data: ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "max": 100 }, "overrides": [] }, @@ -1647,7 +1648,38 @@ data: ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "displayMode": "gradient", From 4721d44a33139beb4aaeb23faf84dd5ad049b8b7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 15:12:53 -0300 Subject: [PATCH 090/416] monitoring: enforce sorted job lists --- scripts/dashboards_render_atlas.py | 24 ++++++++------ .../monitoring/dashboards/atlas-jobs.json | 31 ++++++++----------- .../monitoring/dashboards/atlas-overview.json | 21 +++++-------- .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-jobs.yaml | 31 ++++++++----------- .../grafana-dashboard-overview.yaml | 21 +++++-------- .../monitoring/grafana-dashboard-pods.yaml | 2 +- 7 files changed, 58 insertions(+), 74 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index c3f3655..1f28489 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -748,6 +748,12 @@ def bargauge_panel( overrides=None, ): """Return a bar gauge panel with label-aware reduction.""" + cleaned_expr = expr.strip() + if not cleaned_expr.startswith(("sort(", "sort_desc(")): + if sort_order == "desc": + expr = f"sort_desc({expr})" + elif sort_order == "asc": + expr = f"sort({expr})" panel = { "id": panel_id, "type": "bargauge", @@ -1165,21 +1171,20 @@ def build_overview(): { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": PROM_DS, "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14}, "targets": [ {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, - {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, ], "fieldConfig": { "defaults": {"unit": "none"}, "overrides": [ { - "matcher": {"id": "byName", "options": "Warnings"}, + "matcher": {"id": "byName", "options": "Attempts"}, "properties": [ - {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} ], }, { @@ -2361,21 +2366,20 @@ def build_jobs_dashboard(): { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts / 
Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": PROM_DS, "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0}, "targets": [ {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, - {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, ], "fieldConfig": { "defaults": {"unit": "none"}, "overrides": [ { - "matcher": {"id": "byName", "options": "Warnings"}, + "matcher": {"id": "byName", "options": "Attempts"}, "properties": [ - {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} ], }, { diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 810b3b3..37b888d 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -81,7 +81,7 @@ { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -98,14 +98,9 @@ "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -117,14 +112,14 @@ { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -172,7 +167,7 @@ }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -621,7 +616,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -696,7 +691,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - 
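The sort_desc(...) rewrites in these hunks come from the guard this patch adds to the bar-gauge helper: wrap the query in sort()/sort_desc() so instant bar-gauge rows arrive pre-ordered, but leave it alone when the caller already sorted. Restated here as a standalone, testable function that mirrors the committed logic:

    def apply_sort(expr: str, sort_order: str = "desc") -> str:
        """Wrap a PromQL expression for ordered bar-gauge output."""
        cleaned = expr.strip()
        if cleaned.startswith(("sort(", "sort_desc(")):
            return expr  # already sorted by the caller; do not double-wrap
        if sort_order == "desc":
            return f"sort_desc({expr})"
        if sort_order == "asc":
            return f"sort({expr})"
        return expr


    if __name__ == "__main__":
        assert apply_sort("ariadne_access_requests_total") == \
            "sort_desc(ariadne_access_requests_total)"
        assert apply_sort("sort_desc(x)") == "sort_desc(x)"  # idempotent
        assert apply_sort("x", sort_order="asc") == "sort(x)"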
max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -771,7 +766,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -846,7 +841,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -921,7 +916,7 @@ }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -995,7 +990,7 @@ }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -1069,7 +1064,7 @@ }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sort_desc(ariadne_access_requests_total)", "refId": "A", "legendFormat": "{{status}}", "instant": true diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 3feb531..78744da 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1410,7 +1410,7 @@ }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -1478,7 +1478,7 @@ { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1495,14 +1495,9 @@ "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": 
"sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -1514,14 +1509,14 @@ { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -1606,7 +1601,7 @@ }, "targets": [ { - "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "refId": "A", "legendFormat": "{{result}}", "instant": true @@ -2137,7 +2132,7 @@ }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -2398,7 +2393,7 @@ }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index e36aa1f..0c8104c 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -439,7 +439,7 @@ }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 279d959..b16c9cb 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -90,7 +90,7 @@ data: { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -107,14 +107,9 @@ data: "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -126,14 +121,14 @@ data: { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - 
"fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -181,7 +176,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -630,7 +625,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -705,7 +700,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -780,7 +775,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -855,7 +850,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -930,7 +925,7 @@ data: }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -1004,7 +999,7 @@ data: }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -1078,7 +1073,7 @@ data: }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sort_desc(ariadne_access_requests_total)", "refId": "A", "legendFormat": "{{status}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 66b6da0..fa19911 100644 --- 
a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1419,7 +1419,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -1487,7 +1487,7 @@ data: { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1504,14 +1504,9 @@ data: "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -1523,14 +1518,14 @@ data: { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -1615,7 +1610,7 @@ data: }, "targets": [ { - "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "refId": "A", "legendFormat": "{{result}}", "instant": true @@ -2146,7 +2141,7 @@ data: }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -2407,7 +2402,7 @@ data: }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 6273023..1461eac 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -448,7 +448,7 @@ data: }, "targets": [ { - 
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true From 298d261146697d78bc7f9a4daaea6d12d72b71f3 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 20:04:15 +0000 Subject: [PATCH 091/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 78f5e68..e43f30e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 1039590b14bbe52445942ae9040b4a9c08be6aa7 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 20:05:15 +0000 Subject: [PATCH 092/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e43f30e..ee57a11 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 66cb72947f2662c1f11c8f3f136182bdd30cbd81 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 20:33:18 +0000 Subject: [PATCH 093/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ee57a11..6018087 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 3a987c29ff2546406ac2ed603b415a315043503c Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 20:34:18 +0000 Subject: [PATCH 094/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml 
b/services/bstein-dev-home/kustomization.yaml index 6018087..87cb635 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 3a48569330d40a134cb14dcdfbc9ed335ea93ea6 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:05:29 +0000 Subject: [PATCH 095/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 87cb635..9d4896b 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From da16998d2ed295ae5ff8e4f785d418f7a953b24b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:07:29 +0000 Subject: [PATCH 096/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9d4896b..8ba3cb0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 30b86a693f9a2e5def34c1593e20efc8ae4f7a26 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:23:44 +0000 Subject: [PATCH 097/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 6cb2acd..9b78f34 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-17 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-18 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From d9cda5b6af2b12f1a967977ef03ffd9282c8ae94 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 
Jan 2026 22:30:31 +0000 Subject: [PATCH 098/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 8ba3cb0..36decfa 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 6f4e5dbfe7c237ddf0da19f2f5bb6d0eb9994e41 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:32:31 +0000 Subject: [PATCH 099/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 36decfa..9aa6d82 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From d3b1a925b8977b182a8e38fa16947806b8c30b86 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:52:46 +0000 Subject: [PATCH 100/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 9b78f34..6c5ff2e 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-18 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-19 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From eb5256e6bc6523884f5ac35c90ea9f9b782daa29 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:55:34 +0000 Subject: [PATCH 101/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9aa6d82..52341a7 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: 
registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 067134fa1b14e7d4571755f7bc8b4edcf9ec92a8 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:56:34 +0000 Subject: [PATCH 102/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 52341a7..e133abe 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 41d38033b520bc501dc8c82db701964546407960 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 23:24:37 +0000 Subject: [PATCH 103/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e133abe..7e381ab 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 71122fc200634b7b80f49c96baa2d6ed075a5df9 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 23:24:40 +0000 Subject: [PATCH 104/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7e381ab..5868891 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 75e2c745f758b3b0a0afe55143ecc969848fa5d3 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 23:47:39 +0000 Subject: [PATCH 105/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 
5868891..6f19514 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From d54115df5544b45f6c7cf4e2acb2ea2b3f5c9786 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 23:48:39 +0000 Subject: [PATCH 106/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 6f19514..94ccbce 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 592435f7604ff55aa30c47a7c45bf96147cadd50 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 00:16:42 +0000 Subject: [PATCH 107/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 94ccbce..e013792 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From aa3db22eaf6598339ee63b49aac6580ade286599 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 00:17:42 +0000 Subject: [PATCH 108/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e013792..d4a8429 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: 
bstein-dev-home From beb923cf0e70b4e74b658781337e8825812e8fe7 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 00:48:58 +0000 Subject: [PATCH 109/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 6c5ff2e..84759a4 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-19 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-20 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 3891f1d063fa3528f8f1c6e3472891058a020bca Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 00:59:59 +0000 Subject: [PATCH 110/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 84759a4..1f1c731 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-20 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From c84af0b8df1c2a55d2f761390e4e44cdaf57cecd Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 05:37:20 +0000 Subject: [PATCH 111/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d4a8429..db93333 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 6dcbdcf7045b1ae1592354219bff0a74e5e7a25f Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 05:38:20 +0000 Subject: [PATCH 112/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index db93333..8e945e0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-118 # {"$imagepolicy": 
"bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 62c0e32bc47c0e02c299149c0ede16c728217c83 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 05:40:21 +0000 Subject: [PATCH 113/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 8e945e0..bf79e8b 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 955bbcf58fe6e62dcd924c032674393679c62f03 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 05:41:20 +0000 Subject: [PATCH 114/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index bf79e8b..192ad7e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From ba2b9acbcce3466bff7dde4d55054e43a9ed1579 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 03:15:19 -0300 Subject: [PATCH 115/416] jenkins: use shared harbor creds when present --- services/jenkins/deployment.yaml | 6 ++++++ services/vault/scripts/vault_k8s_auth_configure.sh | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 0b62ee0..0dc76af 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -34,6 +34,12 @@ spec: HARBOR_ROBOT_USERNAME={{ .Data.data.username }} HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} {{ end }} + {{ with secret "kv/data/atlas/shared/harbor-pull" }} + {{- if and .Data.data.username .Data.data.password }} + HARBOR_ROBOT_USERNAME={{ .Data.data.username }} + HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} + {{- end }} + {{ end }} {{ with secret "kv/data/atlas/jenkins/gitea-pat" }} GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index bc03cf4..00fa567 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -219,7 +219,7 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \ write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ 
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" write_policy_and_role "jenkins" "jenkins" "jenkins" \ - "jenkins/*" "" + "jenkins/* shared/harbor-pull" "" write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ "monitoring/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "logging" "logging" "logging-vault-sync" \ From 94953ab0fe9bdb9b708baaa48a73491f6bb98fc7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 04:45:24 -0300 Subject: [PATCH 116/416] jenkins: sync harbor pull secret from vault --- services/jenkins/kustomization.yaml | 3 ++ services/jenkins/secretproviderclass.yaml | 21 ++++++++++++ services/jenkins/vault-serviceaccount.yaml | 6 ++++ services/jenkins/vault-sync-deployment.yaml | 34 +++++++++++++++++++ .../vault/scripts/vault_k8s_auth_configure.sh | 2 +- 5 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 services/jenkins/secretproviderclass.yaml create mode 100644 services/jenkins/vault-serviceaccount.yaml create mode 100644 services/jenkins/vault-sync-deployment.yaml diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index aab859a..df51968 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -5,11 +5,14 @@ namespace: jenkins resources: - namespace.yaml - serviceaccount.yaml + - vault-serviceaccount.yaml - pvc.yaml - cache-pvc.yaml - plugins-pvc.yaml - configmap-jcasc.yaml - configmap-plugins.yaml + - secretproviderclass.yaml + - vault-sync-deployment.yaml - deployment.yaml - service.yaml - ingress.yaml diff --git a/services/jenkins/secretproviderclass.yaml b/services/jenkins/secretproviderclass.yaml new file mode 100644 index 0000000..a9d9dd5 --- /dev/null +++ b/services/jenkins/secretproviderclass.yaml @@ -0,0 +1,21 @@ +# services/jenkins/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: jenkins-vault + namespace: jenkins +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "jenkins" + objects: | + - objectName: "harbor-pull__dockerconfigjson" + secretPath: "kv/data/atlas/shared/harbor-pull" + secretKey: "dockerconfigjson" + secretObjects: + - secretName: harbor-bstein-robot + type: kubernetes.io/dockerconfigjson + data: + - objectName: harbor-pull__dockerconfigjson + key: .dockerconfigjson diff --git a/services/jenkins/vault-serviceaccount.yaml b/services/jenkins/vault-serviceaccount.yaml new file mode 100644 index 0000000..8d31400 --- /dev/null +++ b/services/jenkins/vault-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/jenkins/vault-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: jenkins-vault-sync + namespace: jenkins diff --git a/services/jenkins/vault-sync-deployment.yaml b/services/jenkins/vault-sync-deployment.yaml new file mode 100644 index 0000000..6de64f9 --- /dev/null +++ b/services/jenkins/vault-sync-deployment.yaml @@ -0,0 +1,34 @@ +# services/jenkins/vault-sync-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: jenkins-vault-sync + namespace: jenkins +spec: + replicas: 1 + selector: + matchLabels: + app: jenkins-vault-sync + template: + metadata: + labels: + app: jenkins-vault-sync + spec: + serviceAccountName: jenkins-vault-sync + containers: + - name: sync + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - "sleep infinity" + volumeMounts: + - name: vault-secrets + mountPath: /vault/secrets + readOnly: true + volumes: + - 
name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: jenkins-vault diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 00fa567..a956e0e 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -218,7 +218,7 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \ "nextcloud/* shared/keycloak-admin shared/postmark-relay" "" write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" -write_policy_and_role "jenkins" "jenkins" "jenkins" \ +write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \ "jenkins/* shared/harbor-pull" "" write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ "monitoring/* shared/postmark-relay shared/harbor-pull" "" From 5e35b5f7a22579469f079292252007a0064b0b69 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 04:47:50 -0300 Subject: [PATCH 117/416] vault: unsuspend k8s auth config cronjob --- services/vault/k8s-auth-config-cronjob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml index e7cca14..43da16b 100644 --- a/services/vault/k8s-auth-config-cronjob.yaml +++ b/services/vault/k8s-auth-config-cronjob.yaml @@ -8,7 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" - suspend: true + suspend: false concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 From d2f118ed3206fd9157d204877762eb89dd7d2e38 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 10:56:27 -0300 Subject: [PATCH 118/416] jenkins: pin vault sync to worker nodes --- services/jenkins/vault-sync-deployment.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/jenkins/vault-sync-deployment.yaml b/services/jenkins/vault-sync-deployment.yaml index 6de64f9..6abcace 100644 --- a/services/jenkins/vault-sync-deployment.yaml +++ b/services/jenkins/vault-sync-deployment.yaml @@ -15,6 +15,9 @@ spec: app: jenkins-vault-sync spec: serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" containers: - name: sync image: alpine:3.20 From 0697d7b1b3f2bb9cd238c6302a826061bd528209 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 12:41:58 -0300 Subject: [PATCH 119/416] keycloak: allow harbor direct grants --- .../harbor-oidc-secret-ensure-job.yaml | 2 +- .../scripts/harbor_oidc_secret_ensure.sh | 37 ++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/services/keycloak/harbor-oidc-secret-ensure-job.yaml b/services/keycloak/harbor-oidc-secret-ensure-job.yaml index 8eac50d..87de463 100644 --- a/services/keycloak/harbor-oidc-secret-ensure-job.yaml +++ b/services/keycloak/harbor-oidc-secret-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: harbor-oidc-secret-ensure-9 + name: harbor-oidc-secret-ensure-10 namespace: sso spec: backoffLimit: 0 diff --git a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh index 7187d34..c70caa2 100755 --- a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh +++ b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh @@ -29,7 +29,7 @@ CLIENT_QUERY="$(curl -sS -H "Authorization: 
Bearer ${ACCESS_TOKEN}" \ CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)" if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then - create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}' + create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":true,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}' status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \ -H "Authorization: Bearer ${ACCESS_TOKEN}" \ -H 'Content-Type: application/json' \ @@ -49,6 +49,21 @@ if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then exit 1 fi +CLIENT_CONFIG="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}" || true)" +if [ -n "$CLIENT_CONFIG" ]; then + updated_config="$(echo "$CLIENT_CONFIG" | jq '.directAccessGrantsEnabled=true')" + status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + -H 'Content-Type: application/json' \ + -d "${updated_config}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")" + if [ "$status" != "200" ] && [ "$status" != "204" ]; then + echo "Keycloak client update failed (status ${status})" >&2 + exit 1 + fi +fi + SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ "$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)" if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then @@ -77,6 +92,26 @@ if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2 fi fi +OFFLINE_SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/client-scopes?search=offline_access" | jq -r '.[] | select(.name=="offline_access") | .id' 2>/dev/null | head -n1 || true)" +if [ -n "$OFFLINE_SCOPE_ID" ] && [ "$OFFLINE_SCOPE_ID" != "null" ]; then + if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1 \ + && ! 
echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1; then + status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")" + if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then + status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")" + if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then + echo "Failed to attach offline_access scope to harbor (status ${status})" >&2 + exit 1 + fi + fi + fi +fi + CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)" if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then From d9695d32f6128872e8f0d32838a55712f85b78d4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 13:26:38 -0300 Subject: [PATCH 120/416] harbor: route v2 ingress to registry --- services/harbor/helmrelease.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml index b0cbdbd..db01787 100644 --- a/services/harbor/helmrelease.yaml +++ b/services/harbor/helmrelease.yaml @@ -378,6 +378,16 @@ spec: subPath: app.conf - name: ca-download mountPath: /etc/core/ca + - target: + kind: Ingress + name: harbor-ingress + patch: |- + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/name + value: harbor-registry + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/port/number + value: 5000 - name: psc mountPath: /etc/core/token volumes: From 3fc17b0c7c2601ee4492718d79897002b53596e4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 13:31:12 -0300 Subject: [PATCH 121/416] harbor: fix ingress patch placement --- services/harbor/helmrelease.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml index db01787..16b81a8 100644 --- a/services/harbor/helmrelease.yaml +++ b/services/harbor/helmrelease.yaml @@ -378,16 +378,6 @@ spec: subPath: app.conf - name: ca-download mountPath: /etc/core/ca - - target: - kind: Ingress - name: harbor-ingress - patch: |- - - op: replace - path: /spec/rules/0/http/paths/2/backend/service/name - value: harbor-registry - - op: replace - path: /spec/rules/0/http/paths/2/backend/service/port/number - value: 5000 - name: psc mountPath: /etc/core/token volumes: @@ -401,6 +391,16 @@ spec: $patch: delete - name: core-writable emptyDir: {} + - target: + kind: Ingress + name: harbor-ingress + patch: |- + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/name + value: harbor-registry + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/port/number + value: 5000 - target: kind: Deployment name: harbor-jobservice From 8e3fe266aa1629c089f8e53f04da432443b2d43c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 13:38:06 -0300 Subject: [PATCH 122/416] flux: temporarily drop harbor health checks --- .../atlas/flux-system/applications/harbor/kustomization.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml 
b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml index 06baf26..5eec32f 100644 --- a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml @@ -13,11 +13,6 @@ spec: kind: GitRepository name: flux-system namespace: flux-system - healthChecks: - - apiVersion: helm.toolkit.fluxcd.io/v2 - kind: HelmRelease - name: harbor - namespace: harbor wait: false dependsOn: - name: core From 156effebe318621d0a6f04537d40ac5f6865150b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 14:09:39 -0300 Subject: [PATCH 123/416] ops: pause portal/ariadne and add migrate jobs --- .../bstein-dev-home/backend-deployment.yaml | 16 ++++++- .../chat-ai-gateway-deployment.yaml | 2 +- .../bstein-dev-home/frontend-deployment.yaml | 2 +- services/bstein-dev-home/kustomization.yaml | 1 + .../bstein-dev-home/portal-migrate-job.yaml | 41 ++++++++++++++++++ .../vault-sync-deployment.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 16 ++++++- services/maintenance/ariadne-migrate-job.yaml | 42 +++++++++++++++++++ services/maintenance/kustomization.yaml | 1 + 9 files changed, 118 insertions(+), 5 deletions(-) create mode 100644 services/bstein-dev-home/portal-migrate-job.yaml create mode 100644 services/maintenance/ariadne-migrate-job.yaml diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 074a19d..100c3eb 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-backend namespace: bstein-dev-home spec: - replicas: 1 + replicas: 0 revisionHistoryLimit: 3 selector: matchLabels: @@ -99,6 +99,20 @@ spec: value: "" - name: HTTP_CHECK_TIMEOUT_SEC value: "2" + - name: PORTAL_DB_POOL_MIN + value: "0" + - name: PORTAL_DB_POOL_MAX + value: "5" + - name: PORTAL_DB_CONNECT_TIMEOUT_SEC + value: "5" + - name: PORTAL_DB_LOCK_TIMEOUT_SEC + value: "5" + - name: PORTAL_DB_STATEMENT_TIMEOUT_SEC + value: "30" + - name: PORTAL_DB_IDLE_IN_TX_TIMEOUT_SEC + value: "10" + - name: PORTAL_RUN_MIGRATIONS + value: "false" - name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT value: "30" - name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 40d74fe..3010a9b 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: chat-ai-gateway namespace: bstein-dev-home spec: - replicas: 1 + replicas: 0 revisionHistoryLimit: 2 selector: matchLabels: diff --git a/services/bstein-dev-home/frontend-deployment.yaml b/services/bstein-dev-home/frontend-deployment.yaml index ef26e73..bbe5981 100644 --- a/services/bstein-dev-home/frontend-deployment.yaml +++ b/services/bstein-dev-home/frontend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-frontend namespace: bstein-dev-home spec: - replicas: 1 + replicas: 0 revisionHistoryLimit: 3 selector: matchLabels: diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 192ad7e..28bbc3a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -15,6 +15,7 @@ resources: - frontend-service.yaml - backend-deployment.yaml - backend-service.yaml + - portal-migrate-job.yaml - vaultwarden-cred-sync-cronjob.yaml - 
portal-onboarding-e2e-test-job.yaml - ingress.yaml diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/portal-migrate-job.yaml new file mode 100644 index 0000000..303a04f --- /dev/null +++ b/services/bstein-dev-home/portal-migrate-job.yaml @@ -0,0 +1,41 @@ +# services/bstein-dev-home/portal-migrate-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: bstein-dev-home-portal-migrate + namespace: bstein-dev-home +spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + metadata: + labels: + app: bstein-dev-home-portal-migrate + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "bstein-dev-home" + vault.hashicorp.com/agent-inject-secret-portal-env.sh: "kv/data/atlas/portal/atlas-portal-db" + vault.hashicorp.com/agent-inject-template-portal-env.sh: | + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + spec: + serviceAccountName: bstein-dev-home + restartPolicy: Never + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + imagePullSecrets: + - name: harbor-regcred + containers: + - name: migrate + image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-95 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . /vault/secrets/portal-env.sh + && exec python -m atlas_portal.migrate + env: + - name: PORTAL_RUN_MIGRATIONS + value: "true" diff --git a/services/bstein-dev-home/vault-sync-deployment.yaml b/services/bstein-dev-home/vault-sync-deployment.yaml index ad50f1e..2f2ddbb 100644 --- a/services/bstein-dev-home/vault-sync-deployment.yaml +++ b/services/bstein-dev-home/vault-sync-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-vault-sync namespace: bstein-dev-home spec: - replicas: 1 + replicas: 0 selector: matchLabels: app: bstein-dev-home-vault-sync diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 01e940c..e11f8db 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: ariadne namespace: maintenance spec: - replicas: 1 + replicas: 0 revisionHistoryLimit: 3 selector: matchLabels: @@ -129,6 +129,20 @@ spec: value: https://bstein.dev - name: ARIADNE_LOG_LEVEL value: INFO + - name: ARIADNE_DB_POOL_MIN + value: "0" + - name: ARIADNE_DB_POOL_MAX + value: "5" + - name: ARIADNE_DB_CONNECT_TIMEOUT_SEC + value: "5" + - name: ARIADNE_DB_LOCK_TIMEOUT_SEC + value: "5" + - name: ARIADNE_DB_STATEMENT_TIMEOUT_SEC + value: "30" + - name: ARIADNE_DB_IDLE_IN_TX_TIMEOUT_SEC + value: "10" + - name: ARIADNE_RUN_MIGRATIONS + value: "false" - name: PORTAL_ADMIN_USERS value: bstein - name: PORTAL_ADMIN_GROUPS diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml new file mode 100644 index 0000000..472cf5f --- /dev/null +++ b/services/maintenance/ariadne-migrate-job.yaml @@ -0,0 +1,42 @@ +# services/maintenance/ariadne-migrate-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: ariadne-migrate + namespace: maintenance +spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + metadata: + labels: + app: ariadne-migrate + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "maintenance" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" + 
vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | + {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + spec: + serviceAccountName: ariadne + restartPolicy: Never + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: migrate + image: registry.bstein.dev/bstein/ariadne:0.1.0-0 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . /vault/secrets/ariadne-env.sh + && exec python -m ariadne.migrate + env: + - name: ARIADNE_RUN_MIGRATIONS + value: "true" diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 1f1c731..c1350eb 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -14,6 +14,7 @@ resources: - node-nofile-serviceaccount.yaml - pod-cleaner-rbac.yaml - ariadne-deployment.yaml + - ariadne-migrate-job.yaml - ariadne-service.yaml - disable-k3s-traefik-daemonset.yaml - k3s-traefik-cleanup-job.yaml From d509dfaa22af75e0eff55a0a40aec1de57515a6d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 15:23:23 -0300 Subject: [PATCH 124/416] ops: restore portal/ariadne and add postgres panels --- scripts/dashboards_render_atlas.py | 36 ++++- .../bstein-dev-home/backend-deployment.yaml | 2 +- .../bstein-dev-home/frontend-deployment.yaml | 2 +- .../vault-sync-deployment.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 2 +- .../monitoring/dashboards/atlas-overview.json | 138 +++++++++++++++++- .../grafana-dashboard-overview.yaml | 138 +++++++++++++++++- 7 files changed, 298 insertions(+), 22 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 1f28489..f55896a 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -371,6 +371,10 @@ ARIADNE_TEST_SUCCESS_RATE = ( ARIADNE_TEST_FAILURES_24H = ( 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' ) +POSTGRES_CONN_USED_PCT = ( + "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)" +) +POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))' ONEOFF_JOB_OWNER = ( 'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")' ) @@ -1057,7 +1061,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 3, "w": 6, "x": 0, "y": 8}, + {"h": 3, "w": 4, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -1068,7 +1072,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 3, "w": 6, "x": 12, "y": 8}, + "gridPos": {"h": 3, "w": 4, "x": 8, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1114,7 +1118,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 3, "w": 6, "x": 6, "y": 8}, + {"h": 3, "w": 4, "x": 4, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1126,13 +1130,37 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 3, "w": 6, "x": 18, "y": 8}, + {"h": 3, "w": 4, "x": 12, "y": 8}, unit="percent", 
thresholds=mail_limit_thresholds, decimals=1, links=link_to("atlas-mail"), ) ) + panels.append( + gauge_panel( + 34, + "Postgres Connections Used", + POSTGRES_CONN_USED_PCT, + {"h": 3, "w": 4, "x": 16, "y": 8}, + min_value=0, + max_value=100, + thresholds=PERCENT_THRESHOLDS, + ) + ) + panels.append( + stat_panel( + 35, + "Postgres Hottest Connections", + POSTGRES_CONN_HOTTEST, + {"h": 3, "w": 4, "x": 20, "y": 8}, + unit="none", + decimals=0, + text_mode="name_and_value", + legend="{{datname}}", + instant=True, + ) + ) storage_panels = [ (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 100c3eb..2170396 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-backend namespace: bstein-dev-home spec: - replicas: 0 + replicas: 1 revisionHistoryLimit: 3 selector: matchLabels: diff --git a/services/bstein-dev-home/frontend-deployment.yaml b/services/bstein-dev-home/frontend-deployment.yaml index bbe5981..ef26e73 100644 --- a/services/bstein-dev-home/frontend-deployment.yaml +++ b/services/bstein-dev-home/frontend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-frontend namespace: bstein-dev-home spec: - replicas: 0 + replicas: 1 revisionHistoryLimit: 3 selector: matchLabels: diff --git a/services/bstein-dev-home/vault-sync-deployment.yaml b/services/bstein-dev-home/vault-sync-deployment.yaml index 2f2ddbb..ad50f1e 100644 --- a/services/bstein-dev-home/vault-sync-deployment.yaml +++ b/services/bstein-dev-home/vault-sync-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-vault-sync namespace: bstein-dev-home spec: - replicas: 0 + replicas: 1 selector: matchLabels: app: bstein-dev-home-vault-sync diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index e11f8db..581947c 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: ariadne namespace: maintenance spec: - replicas: 0 + replicas: 1 revisionHistoryLimit: 3 selector: matchLabels: diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 78744da..93a2d80 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -796,7 +796,7 @@ }, "gridPos": { "h": 3, - "w": 6, + "w": 4, "x": 0, "y": 8 }, @@ -863,8 +863,8 @@ }, "gridPos": { "h": 3, - "w": 6, - "x": 12, + "w": 4, + "x": 8, "y": 8 }, "targets": [ @@ -968,8 +968,8 @@ }, "gridPos": { "h": 3, - "w": 6, - "x": 6, + "w": 4, + "x": 4, "y": 8 }, "targets": [ @@ -1044,8 +1044,8 @@ }, "gridPos": { "h": 3, - "w": 6, - "x": 18, + "w": 4, + "x": 12, "y": 8 }, "targets": [ @@ -1110,6 +1110,130 @@ } ] }, + { + "id": 34, + "type": "gauge", + "title": "Postgres Connections Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 8 + }, + "targets": [ + { + "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 
75 + }, + { + "color": "red", + "value": 91.5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false + } + }, + { + "id": 35, + "type": "stat", + "title": "Postgres Hottest Connections", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "topk(1, sum by (datname) (pg_stat_activity_count))", + "refId": "A", + "legendFormat": "{{datname}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, { "id": 23, "type": "stat", diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index fa19911..0e9526e 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -805,7 +805,7 @@ data: }, "gridPos": { "h": 3, - "w": 6, + "w": 4, "x": 0, "y": 8 }, @@ -872,8 +872,8 @@ data: }, "gridPos": { "h": 3, - "w": 6, - "x": 12, + "w": 4, + "x": 8, "y": 8 }, "targets": [ @@ -977,8 +977,8 @@ data: }, "gridPos": { "h": 3, - "w": 6, - "x": 6, + "w": 4, + "x": 4, "y": 8 }, "targets": [ @@ -1053,8 +1053,8 @@ data: }, "gridPos": { "h": 3, - "w": 6, - "x": 18, + "w": 4, + "x": 12, "y": 8 }, "targets": [ @@ -1119,6 +1119,130 @@ data: } ] }, + { + "id": 34, + "type": "gauge", + "title": "Postgres Connections Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 8 + }, + "targets": [ + { + "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false + } + }, + { + "id": 35, + "type": "stat", + "title": "Postgres Hottest Connections", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "topk(1, sum by (datname) (pg_stat_activity_count))", + "refId": "A", + "legendFormat": "{{datname}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": 
"auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, { "id": 23, "type": "stat", From 8788d40dc6c898456fa9a450179308fa5c0b63a5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 15:28:26 -0300 Subject: [PATCH 125/416] ops: bump portal and ariadne image tags --- services/bstein-dev-home/kustomization.yaml | 4 ++-- services/maintenance/kustomization.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 28bbc3a..7c431b2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,9 +21,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index c1350eb..992c889 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From da32ba1680658bb0b971aa682e8dc40f0bb18d3d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:29:01 +0000 Subject: [PATCH 126/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7c431b2..28bbc3a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,9 +21,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From af024aa16a69cbdbbb6ce73073b7e8d3c58a5a6d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:29:24 +0000 Subject: [PATCH 127/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 992c889..c1350eb 100644 --- 
a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From ec5e4ec4a3c5bd778177f67181b9d933146dde6b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 15:33:08 -0300 Subject: [PATCH 128/416] images: auth image scan and bump tags --- services/bstein-dev-home/image.yaml | 4 ++++ services/bstein-dev-home/kustomization.yaml | 4 ++-- services/maintenance/image.yaml | 2 ++ services/maintenance/kustomization.yaml | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/services/bstein-dev-home/image.yaml b/services/bstein-dev-home/image.yaml index 3b6c757..eed2736 100644 --- a/services/bstein-dev-home/image.yaml +++ b/services/bstein-dev-home/image.yaml @@ -7,6 +7,8 @@ metadata: spec: image: registry.bstein.dev/bstein/bstein-dev-home-frontend interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy @@ -28,6 +30,8 @@ metadata: spec: image: registry.bstein.dev/bstein/bstein-dev-home-backend interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 28bbc3a..7c431b2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,9 +21,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home diff --git a/services/maintenance/image.yaml b/services/maintenance/image.yaml index 95acbd0..fd28d90 100644 --- a/services/maintenance/image.yaml +++ b/services/maintenance/image.yaml @@ -7,6 +7,8 @@ metadata: spec: image: registry.bstein.dev/bstein/ariadne interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index c1350eb..992c889 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 21800290ecc46e5f660b7d9d0062a2e9208003a9 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:33:30 +0000 Subject: [PATCH 129/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml 
b/services/maintenance/kustomization.yaml index 992c889..c1350eb 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 8b90b44dfd2b364d9333c8bca7891f0aa0c1c8ec Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:33:48 +0000 Subject: [PATCH 130/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index c1350eb..992c889 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 7eba40a889c33db1164a24867752bc86f9e1c68c Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:34:08 +0000 Subject: [PATCH 131/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7c431b2..28bbc3a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,9 +21,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From efa893b134620983bf3f4ec31a131d5357cb7ce5 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:35:15 +0000 Subject: [PATCH 132/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 28bbc3a..8bfc8a5 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 05c7642f5c5a261668bf127ca3116b566fd7f841 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:35:20 +0000 Subject: [PATCH 133/416] 
chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 8bfc8a5..7c431b2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 9b5d8ac45cba4df7045cf9c0b8206d8db4a255ae Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 15:39:57 -0300 Subject: [PATCH 134/416] jobs: force recreate migrate jobs --- services/bstein-dev-home/portal-migrate-job.yaml | 2 ++ services/maintenance/ariadne-migrate-job.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/portal-migrate-job.yaml index 303a04f..a578b8c 100644 --- a/services/bstein-dev-home/portal-migrate-job.yaml +++ b/services/bstein-dev-home/portal-migrate-job.yaml @@ -4,6 +4,8 @@ kind: Job metadata: name: bstein-dev-home-portal-migrate namespace: bstein-dev-home + annotations: + kustomize.toolkit.fluxcd.io/force: "true" spec: backoffLimit: 1 ttlSecondsAfterFinished: 3600 diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml index 472cf5f..3528f9b 100644 --- a/services/maintenance/ariadne-migrate-job.yaml +++ b/services/maintenance/ariadne-migrate-job.yaml @@ -4,6 +4,8 @@ kind: Job metadata: name: ariadne-migrate namespace: maintenance + annotations: + kustomize.toolkit.fluxcd.io/force: "true" spec: backoffLimit: 1 ttlSecondsAfterFinished: 3600 From 0290a5f7150ac409b7f48895ea3c563e52b5b439 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:47:16 +0000 Subject: [PATCH 135/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7c431b2..3075a66 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 1e8a67904c75e06bc3257edfdd7092934ebdf3c2 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:48:16 +0000 Subject: [PATCH 136/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 3075a66..c03f2c7 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ 
b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From e5281ad4c0467cfa63bde25a2baa999c099a063c Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:00:29 +0000 Subject: [PATCH 137/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index c03f2c7..38b7c40 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 1e815ce011bc7bd412e220fd7b7f10f9c19ee214 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:00:34 +0000 Subject: [PATCH 138/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 38b7c40..4eaed54 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From cca3a756b35714c9ad6908c2aba5129349cddd25 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:02:01 +0000 Subject: [PATCH 139/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 992c889..2de807e 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-37 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 820e624a0b729085cdafd24be4a50c169654d219 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 17:58:53 -0300 Subject: [PATCH 140/416] jenkins: set timezone to America/Chicago --- services/jenkins/deployment.yaml | 
4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 0dc76af..63f722b 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -108,7 +108,9 @@ spec: containerPort: 50000 env: - name: JAVA_OPTS - value: "-Xms512m -Xmx2048m" + value: "-Xms512m -Xmx2048m -Duser.timezone=America/Chicago" + - name: TZ + value: "America/Chicago" - name: JENKINS_OPTS value: "--webroot=/var/jenkins_cache/war" - name: JENKINS_SLAVE_AGENT_PORT From ce5b1d135377192fd9e26587a5c73d50b2117578 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 18:23:17 -0300 Subject: [PATCH 141/416] monitoring: add postgres metrics and update overview --- infrastructure/postgres/service.yaml | 8 ++++ infrastructure/postgres/statefulset.yaml | 17 ++++++++ scripts/dashboards_render_atlas.py | 16 +++---- .../monitoring/dashboards/atlas-overview.json | 42 ++++++++++--------- .../grafana-dashboard-overview.yaml | 42 ++++++++++--------- 5 files changed, 78 insertions(+), 47 deletions(-) diff --git a/infrastructure/postgres/service.yaml b/infrastructure/postgres/service.yaml index 3dcab3c..b695045 100644 --- a/infrastructure/postgres/service.yaml +++ b/infrastructure/postgres/service.yaml @@ -4,6 +4,10 @@ kind: Service metadata: name: postgres-service namespace: postgres + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9187" + prometheus.io/path: "/metrics" spec: clusterIP: None ports: @@ -11,5 +15,9 @@ spec: port: 5432 protocol: TCP targetPort: 5432 + - name: metrics + port: 9187 + protocol: TCP + targetPort: 9187 selector: app: postgres diff --git a/infrastructure/postgres/statefulset.yaml b/infrastructure/postgres/statefulset.yaml index e1a1921..2c79248 100644 --- a/infrastructure/postgres/statefulset.yaml +++ b/infrastructure/postgres/statefulset.yaml @@ -58,6 +58,23 @@ spec: - name: vault-secrets mountPath: /mnt/vault readOnly: true + - name: postgres-exporter + image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0 + ports: + - name: metrics + containerPort: 9187 + protocol: TCP + env: + - name: DATA_SOURCE_URI + value: "localhost:5432/postgres?sslmode=disable" + - name: DATA_SOURCE_USER + value: postgres + - name: DATA_SOURCE_PASS_FILE + value: /mnt/vault/postgres_password + volumeMounts: + - name: vault-secrets + mountPath: /mnt/vault + readOnly: true volumes: - name: vault-secrets csi: diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index f55896a..11479d9 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -371,8 +371,9 @@ ARIADNE_TEST_SUCCESS_RATE = ( ARIADNE_TEST_FAILURES_24H = ( 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' ) -POSTGRES_CONN_USED_PCT = ( - "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)" +POSTGRES_CONN_USED = ( + 'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") ' + 'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")' ) POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))' ONEOFF_JOB_OWNER = ( @@ -1138,14 +1139,15 @@ def build_overview(): ) ) panels.append( - gauge_panel( + stat_panel( 34, "Postgres Connections Used", - POSTGRES_CONN_USED_PCT, + POSTGRES_CONN_USED, {"h": 3, "w": 4, "x": 16, "y": 8}, - min_value=0, - max_value=100, - thresholds=PERCENT_THRESHOLDS, + decimals=0, + text_mode="name_and_value", + 
legend="{{conn}}", + instant=True, ) ) panels.append( diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 93a2d80..2d7f3e5 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1112,7 +1112,7 @@ }, { "id": 34, - "type": "gauge", + "type": "stat", "title": "Postgres Connections Used", "datasource": { "type": "prometheus", @@ -1126,39 +1126,43 @@ }, "targets": [ { - "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)", - "refId": "A" + "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")", + "refId": "A", + "legendFormat": "{{conn}}", + "instant": true } ], "fieldConfig": { "defaults": { - "min": 0, - "max": 100, + "color": { + "mode": "thresholds" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 + "color": "green", + "value": 1 } ] - } + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -1166,9 +1170,7 @@ "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "name_and_value" } }, { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 0e9526e..5336134 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1121,7 +1121,7 @@ data: }, { "id": 34, - "type": "gauge", + "type": "stat", "title": "Postgres Connections Used", "datasource": { "type": "prometheus", @@ -1135,39 +1135,43 @@ data: }, "targets": [ { - "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)", - "refId": "A" + "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")", + "refId": "A", + "legendFormat": "{{conn}}", + "instant": true } ], "fieldConfig": { "defaults": { - "min": 0, - "max": 100, + "color": { + "mode": "thresholds" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 + "color": "green", + "value": 1 } ] - } + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -1175,9 +1179,7 @@ data: "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "name_and_value" } }, { From 7c9ee41180bc7e4b5e90cdd69e096ae0ef51a972 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:41:04 +0000 Subject: [PATCH 142/416] chore(maintenance): automated image 
update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 2de807e..6f5b7dc 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-37 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-38 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 71996fb199697691e88767d76cae35fb09044132 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:51:32 +0000 Subject: [PATCH 143/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 4eaed54..cebb191 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 0c27b48a1ce8158029ddc7701e7bc92255d53123 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:53:32 +0000 Subject: [PATCH 144/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index cebb191..3ff70ab 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 301909f92ea36ca675bbd76c4051aa66955d9933 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 22:08:33 +0000 Subject: [PATCH 145/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 3ff70ab..4e811e0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} 
configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 8b7e21f0cc9fd89f557a75982f98129d8e8e5ab7 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 22:08:37 +0000 Subject: [PATCH 146/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 4e811e0..7dbfa1c 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 25c4f3e07b6e48828a4cd1c36115662f9ed2a934 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 22:16:34 +0000 Subject: [PATCH 147/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7dbfa1c..200ee58 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 8913c5a5f209c9844f4dec5e4643b5edd2fe6c67 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 22:16:37 +0000 Subject: [PATCH 148/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 200ee58..d4f2e02 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 4d566a738833a9e54a4b98a80b46769dcc4ce939 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:07:49 +0000 Subject: [PATCH 149/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d4f2e02..459c63d 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ 
b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 5ca247f143c939cb3faa017c12a365bd6d8f8444 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:08:49 +0000 Subject: [PATCH 150/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 459c63d..d9fa7c0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 8545f2bc50a0a4f0a2859f876581d5997bd7166f Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:12:49 +0000 Subject: [PATCH 151/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d9fa7c0..f651a92 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From d3d680383b32dc86f8f5d3a19b4c1f499c1cac98 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:14:49 +0000 Subject: [PATCH 152/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f651a92..78f1cae 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 62d16ae388fc85214b98c79e662950615a6ac62b Mon Sep 17 00:00:00 2001 
From: flux-bot Date: Fri, 23 Jan 2026 01:32:51 +0000 Subject: [PATCH 153/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 78f1cae..ae77c9a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-128 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From a18f7e98a20cc1be47f8c051a99eaf7d2cd51bf2 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:42:52 +0000 Subject: [PATCH 154/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ae77c9a..26b8536 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-128 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-129 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 8dedefb4b46e04650a0eedbfbee9b29c26451bcd Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:51:53 +0000 Subject: [PATCH 155/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 26b8536..48f5bf7 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-129 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 2dc680b8f8f79a7be5d680b55efcb6d2fbaad15d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:52:53 +0000 Subject: [PATCH 156/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 48f5bf7..b5f5319 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-130 # {"$imagepolicy": 
"bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From e3247f606fc4fa15342aacaa041575a5fef659ab Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 02:46:57 +0000 Subject: [PATCH 157/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index b5f5319..d2512be 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From b1fa40acc10dce16e6ca0de1d379ba7455dbd3c3 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 02:47:58 +0000 Subject: [PATCH 158/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d2512be..f36c317 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 72e6a09bd0298dfa82b65237fc30d0e5fa08806a Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:01:59 +0000 Subject: [PATCH 159/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f36c317..912cd1f 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From e0bf10cad9ac42408bedb2062d939152adbac81f Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:02:59 +0000 Subject: [PATCH 160/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 912cd1f..8b47e2e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 1d39015d33c31a93e2ce321ca7946ba342999b84 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:10:59 +0000 Subject: [PATCH 161/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 8b47e2e..c83d9f3 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 2475d4ca9d55ced8f67d5cd7f5af53cea9e7c981 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:11:03 +0000 Subject: [PATCH 162/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index c83d9f3..81931f2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 4c66b538a7e2738e02b8f3986ea9e242f06a7301 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:38:02 +0000 Subject: [PATCH 163/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 81931f2..aab9154 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-134 # {"$imagepolicy": 
"bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 3474df40d42dcf0da8db4d071af7765ae2f882ee Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:39:02 +0000 Subject: [PATCH 164/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index aab9154..45a2d81 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 58d9cb616f2a9c6a39b284f180c52e8301de3b90 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 01:35:15 -0300 Subject: [PATCH 165/416] comms: enable MSC4108 rendezvous in synapse --- services/comms/helmrelease.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/comms/helmrelease.yaml b/services/comms/helmrelease.yaml index 4456348..e6536fa 100644 --- a/services/comms/helmrelease.yaml +++ b/services/comms/helmrelease.yaml @@ -138,6 +138,8 @@ spec: auto_join_rooms: - "#othrys:live.bstein.dev" autocreate_auto_join_rooms: true + experimental: + msc4108_enabled: true default_room_version: "11" experimental_features: msc3266_enabled: true From 3d633a5627190fa53788f57d61f7f2c22aae1dd1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 01:46:03 -0300 Subject: [PATCH 166/416] comms: enable MSC4108 under experimental_features --- services/comms/helmrelease.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/services/comms/helmrelease.yaml b/services/comms/helmrelease.yaml index e6536fa..eeac49e 100644 --- a/services/comms/helmrelease.yaml +++ b/services/comms/helmrelease.yaml @@ -138,10 +138,9 @@ spec: auto_join_rooms: - "#othrys:live.bstein.dev" autocreate_auto_join_rooms: true - experimental: - msc4108_enabled: true default_room_version: "11" experimental_features: + msc4108_enabled: true msc3266_enabled: true msc4143_enabled: true msc4222_enabled: true From 3cacbad4c0cb346bdc23165999e29d670f97f8f9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 02:04:51 -0300 Subject: [PATCH 167/416] comms/keycloak: add mailu email claim --- services/comms/mas-configmap.yaml | 2 +- services/keycloak/realm-settings-job.yaml | 47 +++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/services/comms/mas-configmap.yaml b/services/comms/mas-configmap.yaml index 5e6cfdd..9d2c11e 100644 --- a/services/comms/mas-configmap.yaml +++ b/services/comms/mas-configmap.yaml @@ -72,7 +72,7 @@ data: template: "{{ user.name }}" email: action: force - template: "{{ user.email }}" + template: "{{ user.mailu_email }}" policy: data: diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index 6e6589d..e94076c 100644 --- a/services/keycloak/realm-settings-job.yaml +++ 
b/services/keycloak/realm-settings-job.yaml @@ -542,6 +542,53 @@ spec: if status not in (201, 204): raise SystemExit(f"Unexpected mailu email mapper create response: {status}") + mailu_claim_mapper = { + "name": "mailu-email-claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": False, + "config": { + "user.attribute": "mailu_email", + "claim.name": "mailu_email", + "jsonType.label": "String", + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "multivalued": "false", + "aggregate.attrs": "false", + }, + } + status, mappers = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + ) + existing_claim = None + if status == 200 and isinstance(mappers, list): + for item in mappers: + if isinstance(item, dict) and item.get("name") == mailu_claim_mapper["name"]: + existing_claim = item + break + if existing_claim and existing_claim.get("id"): + mailu_claim_mapper["id"] = existing_claim["id"] + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing_claim['id']}", + access_token, + mailu_claim_mapper, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected mailu email claim mapper update response: {status}") + else: + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + mailu_claim_mapper, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected mailu email claim mapper create response: {status}") + # Ensure MFA is on by default for newly-created users. status, required_actions = http_json( "GET", From 18ac46d4b83f17854dca7069e60d41bad90eb806 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 02:09:53 -0300 Subject: [PATCH 168/416] keycloak: bump realm settings job --- services/keycloak/realm-settings-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index e94076c..0de48d1 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-34 + name: keycloak-realm-settings-35 namespace: sso spec: backoffLimit: 0 From 3d2e0ead1c7db0d21a542a6916d12db129d17cbf Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 03:11:42 -0300 Subject: [PATCH 169/416] portal: bump migrate job name --- services/bstein-dev-home/portal-migrate-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/portal-migrate-job.yaml index a578b8c..2cb2a12 100644 --- a/services/bstein-dev-home/portal-migrate-job.yaml +++ b/services/bstein-dev-home/portal-migrate-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: bstein-dev-home-portal-migrate + name: bstein-dev-home-portal-migrate-36 namespace: bstein-dev-home annotations: kustomize.toolkit.fluxcd.io/force: "true" From 3d4208f87717931c3e2db6dbf70d3db64dd373ab Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 06:13:15 +0000 Subject: [PATCH 170/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml 
b/services/bstein-dev-home/kustomization.yaml index 45a2d81..41ad3e5 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From ca47e03953e67a4c37732b1b87fa323b11c51796 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 06:14:16 +0000 Subject: [PATCH 171/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 41ad3e5..ea326a2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From fda986ab3d943a696ab2e049d1ba1246cf38a528 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 03:28:26 -0300 Subject: [PATCH 172/416] bstein-dev-home: separate portal migrations --- .../kustomization.yaml | 16 ++++++++++++++++ .../flux-system/applications/kustomization.yaml | 1 + services/bstein-dev-home/kustomization.yaml | 1 - .../migrations/kustomization.yaml | 6 ++++++ .../{ => migrations}/portal-migrate-job.yaml | 2 +- 5 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml create mode 100644 services/bstein-dev-home/migrations/kustomization.yaml rename services/bstein-dev-home/{ => migrations}/portal-migrate-job.yaml (95%) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml new file mode 100644 index 0000000..f962de0 --- /dev/null +++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml @@ -0,0 +1,16 @@ +# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: bstein-dev-home-migrations + namespace: flux-system +spec: + interval: 10m + path: ./services/bstein-dev-home/migrations + prune: true + sourceRef: + kind: GitRepository + name: flux-system + targetNamespace: bstein-dev-home + wait: false + suspend: true diff --git a/clusters/atlas/flux-system/applications/kustomization.yaml b/clusters/atlas/flux-system/applications/kustomization.yaml index 417a3ec..10c203d 100644 --- a/clusters/atlas/flux-system/applications/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/kustomization.yaml @@ -12,6 +12,7 @@ resources: - pegasus/image-automation.yaml - bstein-dev-home/kustomization.yaml 
- bstein-dev-home/image-automation.yaml + - bstein-dev-home-migrations/kustomization.yaml - harbor/kustomization.yaml - harbor/image-automation.yaml - jellyfin/kustomization.yaml diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ea326a2..e6a744f 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -15,7 +15,6 @@ resources: - frontend-service.yaml - backend-deployment.yaml - backend-service.yaml - - portal-migrate-job.yaml - vaultwarden-cred-sync-cronjob.yaml - portal-onboarding-e2e-test-job.yaml - ingress.yaml diff --git a/services/bstein-dev-home/migrations/kustomization.yaml b/services/bstein-dev-home/migrations/kustomization.yaml new file mode 100644 index 0000000..067665b --- /dev/null +++ b/services/bstein-dev-home/migrations/kustomization.yaml @@ -0,0 +1,6 @@ +# services/bstein-dev-home/migrations/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: bstein-dev-home +resources: + - portal-migrate-job.yaml diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/migrations/portal-migrate-job.yaml similarity index 95% rename from services/bstein-dev-home/portal-migrate-job.yaml rename to services/bstein-dev-home/migrations/portal-migrate-job.yaml index 2cb2a12..9d05254 100644 --- a/services/bstein-dev-home/portal-migrate-job.yaml +++ b/services/bstein-dev-home/migrations/portal-migrate-job.yaml @@ -1,4 +1,4 @@ -# services/bstein-dev-home/portal-migrate-job.yaml +# services/bstein-dev-home/migrations/portal-migrate-job.yaml apiVersion: batch/v1 kind: Job metadata: From df3f4a0c0bf0b083a72474371dd9500d93afbf0f Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 06:44:18 +0000 Subject: [PATCH 173/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e6a744f..f705c4e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From ef42dac97ba284b4db68017954c1cb6554994ba6 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 06:45:19 +0000 Subject: [PATCH 174/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f705c4e..94239e3 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-136 # {"$imagepolicy": 
"bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From a988af3262325b58ee0e4cf14a24a526274fb030 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 11:50:55 -0300 Subject: [PATCH 175/416] monitoring: alert on VM outage --- .../vault-csi/secrets-store-csi-driver.yaml | 3 +- .../monitoring/grafana-alerting-config.yaml | 53 +++++++++++++++++++ services/monitoring/helmrelease.yaml | 2 +- 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/infrastructure/vault-csi/secrets-store-csi-driver.yaml b/infrastructure/vault-csi/secrets-store-csi-driver.yaml index 0b249fc..0004c0d 100644 --- a/infrastructure/vault-csi/secrets-store-csi-driver.yaml +++ b/infrastructure/vault-csi/secrets-store-csi-driver.yaml @@ -17,4 +17,5 @@ spec: values: syncSecret: enabled: true - enableSecretRotation: false + enableSecretRotation: true + rotationPollInterval: 2m diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index daa1e29..8713d3d 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -180,6 +180,59 @@ data: summary: "{{ $labels.instance }} CPU >90% for 10m" labels: severity: warning + - orgId: 1 + name: atlas-metrics + folder: Alerts + interval: 1m + rules: + - uid: victoria-metrics-down + title: "VictoriaMetrics unavailable (>30m)" + condition: C + for: "30m" + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: sum(up{job="victoriametrics"}) + legendFormat: victoriametrics + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + reducer: + type: last + type: query + noDataState: Alerting + execErrState: Alerting + annotations: + summary: "VictoriaMetrics is unavailable for >30m" + labels: + severity: critical - orgId: 1 name: maintenance folder: Alerts diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index ac24f8a..8e225d4 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -342,7 +342,7 @@ spec: GF_SMTP_HOST: "mail.bstein.dev:587" GF_SMTP_FROM: "no-reply-grafana@bstein.dev" GF_SMTP_FROM_NAME: "Atlas Grafana" - GRAFANA_ALERT_EMAILS: "alerts@bstein.dev" + GRAFANA_ALERT_EMAILS: "brad@bstein.dev" GF_SECURITY_ALLOW_EMBEDDING: "true" GF_AUTH_GENERIC_OAUTH_ENABLED: "true" GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak" From 3338efa58e42e934a260b901bf4e6a48901d325b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 14:07:52 -0300 Subject: [PATCH 176/416] finance: allow actual user creation --- services/finance/actual-budget-deployment.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/finance/actual-budget-deployment.yaml b/services/finance/actual-budget-deployment.yaml index 55186b2..637e9ae 100644 --- a/services/finance/actual-budget-deployment.yaml +++ b/services/finance/actual-budget-deployment.yaml @@ -90,6 +90,8 @@ spec: value: openid - name: ACTUAL_MULTIUSER value: "true" + - name: ACTUAL_USER_CREATION_MODE + value: login - name: ACTUAL_OPENID_DISCOVERY_URL value: 
https://sso.bstein.dev/realms/atlas - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT @@ -128,6 +130,8 @@ spec: value: openid - name: ACTUAL_MULTIUSER value: "true" + - name: ACTUAL_USER_CREATION_MODE + value: login - name: ACTUAL_OPENID_DISCOVERY_URL value: https://sso.bstein.dev/realms/atlas - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT From 60840d1171f53f5db01be4cd2662690b1138356b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 19:11:58 +0000 Subject: [PATCH 177/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 94239e3..5d2a1fd 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 00bcc0d4c2b8273880096dd56bf6570f46ae60c6 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 19:13:56 +0000 Subject: [PATCH 178/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 5d2a1fd..23381a0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 0758c2e06dd424e90801c237e402702d41e51dc1 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 19:56:31 +0000 Subject: [PATCH 179/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 6f5b7dc..617b715 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-38 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-39 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 8bdf60542d4e02a290f679389be385e88732df15 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 19:58:00 +0000 Subject: [PATCH 180/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 
23381a0..4007b7d 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From c28444a2331585f3aa2a446032373c7ca8df68b4 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:00:01 +0000 Subject: [PATCH 181/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 4007b7d..e43647c 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From d521c66d603c5eed25c76bc0d0a46ff62ba4da41 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 17:21:18 -0300 Subject: [PATCH 182/416] maintenance: rotate ariadne migrate job name --- services/maintenance/ariadne-migrate-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml index 3528f9b..b9b1496 100644 --- a/services/maintenance/ariadne-migrate-job.yaml +++ b/services/maintenance/ariadne-migrate-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: ariadne-migrate + name: ariadne-migrate-2 namespace: maintenance annotations: kustomize.toolkit.fluxcd.io/force: "true" From ee6ef749826ff36a3c77d8cb8503d60d967ed7d4 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:47:05 +0000 Subject: [PATCH 183/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e43647c..1642cbe 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 0127c62f51f5ce3981b77ca141cf71f382f12823 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:48:05 +0000 Subject: [PATCH 184/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 1642cbe..9f989fd 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 7b336c76a14736746c40c65966aadb68e7d1e259 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:50:05 +0000 Subject: [PATCH 185/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9f989fd..b11cb44 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 45352f79ba4be30f99036de41b23ab77f0f9a5bc Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:51:05 +0000 Subject: [PATCH 186/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index b11cb44..0039328 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From bc2e1058d62ed7e7c2566cbe2f5ee82986ed7736 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 21:27:08 +0000 Subject: [PATCH 187/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 0039328..a5482c0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: 
registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From e2501bd3d069e2e222c5db3d0a5d025b2a3f980c Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 21:28:08 +0000 Subject: [PATCH 188/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a5482c0..1718603 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 5ec4bb9c61e7188bcc99df59822d972eb0bae10e Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 21:44:40 +0000 Subject: [PATCH 189/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 617b715..0963606 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-39 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-43 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From b94b016b0f64482249111b3a93f7ad724754da6c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 18:58:14 -0300 Subject: [PATCH 190/416] flux: force apply migrations --- .../applications/bstein-dev-home-migrations/kustomization.yaml | 1 + .../atlas/flux-system/platform/maintenance/kustomization.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml index f962de0..da61b2d 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml @@ -8,6 +8,7 @@ spec: interval: 10m path: ./services/bstein-dev-home/migrations prune: true + force: true sourceRef: kind: GitRepository name: flux-system diff --git a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml index fc655a4..8477ec9 100644 --- a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml @@ -8,6 +8,7 @@ spec: interval: 10m path: ./services/maintenance prune: true + force: true sourceRef: kind: GitRepository name: flux-system From d9c3ff81950c78cf83c389cd5937cf7b7ebe83f0 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:21:43 +0000 Subject: [PATCH 191/416] chore(maintenance): automated image update --- 
services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 0963606..18d0008 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-43 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-44 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From c24f2dafc117523706a12c38c2319eddcdff1e7d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:24:13 +0000 Subject: [PATCH 192/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 1718603..487fa64 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From aaef2b7ab567a26ec54e7466cedddef508e3e52f Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:25:15 +0000 Subject: [PATCH 193/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 487fa64..a3914b5 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 115f86907f1466f9e95581577d8837fc1d0d9379 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:39:15 +0000 Subject: [PATCH 194/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a3914b5..a58bea7 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-145 # {"$imagepolicy": 
"bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From e43340f2a1f75a1ff5bac527143d9c8a7b17c394 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:40:15 +0000 Subject: [PATCH 195/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a58bea7..ab69f05 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From a603b3726fd9efcad04109ef4d208f0c22ac54a1 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:19:18 +0000 Subject: [PATCH 196/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ab69f05..2fe7ad2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 8e6d9e1c37145cfcc70ff042ce3f96393869a779 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:19:21 +0000 Subject: [PATCH 197/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 2fe7ad2..06829f6 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 82fceb11a43513e312fad3f33bd8c8a73cbc26e9 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:28:20 +0000 Subject: [PATCH 198/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 06829f6..655cfae 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ 
b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 38d2dad28f7d1d441e9741d34f562764ff71ba67 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:28:28 +0000 Subject: [PATCH 199/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 655cfae..3370bb1 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 67643e3fad042a5c9aa94ca40d6ece3941ba2490 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:52:21 +0000 Subject: [PATCH 200/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 3370bb1..9c95b90 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 437281f6a5f8ad66d4756c63538519784083b3c4 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:53:21 +0000 Subject: [PATCH 201/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9c95b90..0fa4611 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From e7d18be4edec711edb35342a3529bbd5319b911b Mon Sep 17 00:00:00 2001 
From: Brad Stein Date: Fri, 23 Jan 2026 22:30:50 -0300 Subject: [PATCH 202/416] keycloak: add vaultwarden_grandfathered flag --- services/keycloak/realm-settings-job.yaml | 1 + services/maintenance/ariadne-deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index 0de48d1..74f569b 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -333,6 +333,7 @@ spec: ensure_group("admin") ensure_group("demo") ensure_group("test") + ensure_group("vaultwarden_grandfathered") planka_group = ensure_group("planka-users") if planka_group and planka_group.get("id"): diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 581947c..52d10f9 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -150,7 +150,7 @@ spec: - name: ACCOUNT_ALLOWED_GROUPS value: dev,admin - name: ALLOWED_FLAG_GROUPS - value: demo,test + value: demo,test,vaultwarden_grandfathered - name: DEFAULT_USER_GROUPS value: dev - name: MAILU_DOMAIN From d07f14826b80e78796f30f86502a4c9142584efe Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 01:33:29 +0000 Subject: [PATCH 203/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 0fa4611..550a7a8 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 473bebaf52fb04bff3c184e1a1edf2f182a93a82 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 01:33:33 +0000 Subject: [PATCH 204/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 550a7a8..efed9a3 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 292ec7359b64bc70340af15a21061c64d33834e4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 22:41:20 -0300 Subject: [PATCH 205/416] keycloak: rerun realm settings job --- services/keycloak/realm-settings-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml 
index 74f569b..9265ca3 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-35 + name: keycloak-realm-settings-36 namespace: sso spec: backoffLimit: 0 From 82312d0fbf673f48086b02538afe962478f7c018 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 02:05:32 +0000 Subject: [PATCH 206/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index efed9a3..f38bd96 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From eeb84e8e70b4c2134d51b7bf3645b10f6ad14bc5 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 02:07:32 +0000 Subject: [PATCH 207/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f38bd96..276c82f 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From a7f5a601907c0b53691c31d1ea8a7fd0a008e07d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 09:29:39 +0000 Subject: [PATCH 208/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 18d0008..b351615 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-44 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-47 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 8b1b824a2972500e53e3a4999731df734cc05c54 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 10:13:43 +0000 Subject: [PATCH 209/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index b351615..4e261cb 100644 --- a/services/maintenance/kustomization.yaml 
+++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-47 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-48 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 00eb4be529ffbe31215c38154a6f9e994bc1c7c0 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 10:15:15 +0000 Subject: [PATCH 210/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 276c82f..d7cbaf7 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From cf2d0c5eff0267a37a333a835dc27c9e98c7e516 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 10:16:15 +0000 Subject: [PATCH 211/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d7cbaf7..cab14d7 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From a27bb0e198d1deb6061a4c154f5a3388c429b646 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 14:31:37 +0000 Subject: [PATCH 212/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index cab14d7..fad8534 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 0b7d87cef4b458a94066ff17a4f5a8c01d9ab97b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 14:32:37 +0000 Subject: [PATCH 213/416] chore(bstein-dev-home): automated image update --- 
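The long runs of flux-bot commits in this series are Flux image automation rewriting the newTag: fields at the setter markers — the trailing # {"$imagepolicy": "<namespace>:<policy>:tag"} comments on each image entry. Frontend and backend tags tend to land in back-to-back commits, apparently as each ImagePolicy picks up its new tag. Below is a minimal, illustrative scan of those markers — a sketch of the marker convention only, not Flux's actual implementation; the example path is one of the files touched in this series:

import re
from pathlib import Path

# Matches lines such as:
#   newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
MARKER = re.compile(
    r'newTag:\s*(?P<tag>\S+)\s*#\s*\{"\$imagepolicy":\s*"(?P<policy>[^"]+):tag"\}'
)

def current_tags(kustomization: str) -> dict[str, str]:
    """Return {image-policy name: tag currently pinned in the file}."""
    text = Path(kustomization).read_text()
    return {m.group("policy"): m.group("tag") for m in MARKER.finditer(text)}

# e.g. current_tags("services/bstein-dev-home/kustomization.yaml")
# -> {"bstein-dev-home:bstein-dev-home-frontend": "0.1.1-155", ...}
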
services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index fad8534..4b21d1e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 6e4e2bdc0c371706415bcbe80d05338e4bccd843 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 14:44:38 +0000 Subject: [PATCH 214/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 4b21d1e..60db96a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From d0c69cd480897ec158a5550e740775c112f29997 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 14:46:38 +0000 Subject: [PATCH 215/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 60db96a..9d34348 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From ee154f14945587b2f08219636103b6269e8634ee Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 24 Jan 2026 14:16:36 -0300 Subject: [PATCH 216/416] vaultwarden: bump to 1.35.2 --- knowledge/catalog/atlas.json | 2 +- knowledge/catalog/atlas.yaml | 2 +- services/comms/knowledge/catalog/atlas.json | 2 +- services/comms/knowledge/catalog/atlas.yaml | 2 +- services/vaultwarden/deployment.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json index 0d97bcd..18cb6b6 100644 --- a/knowledge/catalog/atlas.json +++ b/knowledge/catalog/atlas.json @@ -998,7 +998,7 @@ "serviceAccountName": null, "nodeSelector": {}, "images": [ - "vaultwarden/server:1.33.2" + "vaultwarden/server:1.35.2" ] } ], 
diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml index f3e04a8..580a331 100644 --- a/knowledge/catalog/atlas.yaml +++ b/knowledge/catalog/atlas.yaml @@ -672,7 +672,7 @@ workloads: serviceAccountName: null nodeSelector: {} images: - - vaultwarden/server:1.33.2 + - vaultwarden/server:1.35.2 services: - namespace: ai name: ollama diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json index 0d97bcd..18cb6b6 100644 --- a/services/comms/knowledge/catalog/atlas.json +++ b/services/comms/knowledge/catalog/atlas.json @@ -998,7 +998,7 @@ "serviceAccountName": null, "nodeSelector": {}, "images": [ - "vaultwarden/server:1.33.2" + "vaultwarden/server:1.35.2" ] } ], diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml index 6529e1a..67f2fcb 100644 --- a/services/comms/knowledge/catalog/atlas.yaml +++ b/services/comms/knowledge/catalog/atlas.yaml @@ -672,7 +672,7 @@ workloads: serviceAccountName: null nodeSelector: {} images: - - vaultwarden/server:1.33.2 + - vaultwarden/server:1.35.2 services: - namespace: ai name: ollama diff --git a/services/vaultwarden/deployment.yaml b/services/vaultwarden/deployment.yaml index 2893a92..e1d888a 100644 --- a/services/vaultwarden/deployment.yaml +++ b/services/vaultwarden/deployment.yaml @@ -39,7 +39,7 @@ spec: node-role.kubernetes.io/worker: "true" containers: - name: vaultwarden - image: vaultwarden/server:1.33.2 + image: vaultwarden/server:1.35.2 command: ["/bin/sh", "-c"] args: - >- From f471a3049935282bd6de90489e12e0b238d5e207 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 00:06:26 +0000 Subject: [PATCH 217/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9d34348..63eaebf 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 70e79f25b00923dd3857e49bd27b2772d461bc24 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 00:07:26 +0000 Subject: [PATCH 218/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 63eaebf..1511f5c 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 
39df6ff03945a00a6926f20fca10d5b12e777722 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 17:39:57 +0000 Subject: [PATCH 219/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 1511f5c..7ed1b52 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From af9fcdeae96a4db2b277c0e46e6853e16fc23e53 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 17:40:57 +0000 Subject: [PATCH 220/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7ed1b52..0890f59 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 5a994f4d42aa1b9b8c3ebdf39dedc81d8ea20d3e Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 18:04:59 +0000 Subject: [PATCH 221/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 0890f59..c0aff7f 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 04465407d242b4f180efa6210578f29172974ebb Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 18:06:59 +0000 Subject: [PATCH 222/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index c0aff7f..90c3b8d 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: 
registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From ec6b51cfd246e975af8b3c7d5a1886cf116e0c31 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 25 Jan 2026 15:59:12 -0300 Subject: [PATCH 223/416] comms: route atlasbot to chat gateway --- services/bstein-dev-home/chat-ai-gateway-deployment.yaml | 2 +- services/comms/atlasbot-deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 3010a9b..40d74fe 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: chat-ai-gateway namespace: bstein-dev-home spec: - replicas: 0 + replicas: 1 revisionHistoryLimit: 2 selector: matchLabels: diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4618053..278a008 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -76,7 +76,7 @@ spec: - name: BOT_USER value: atlasbot - name: OLLAMA_URL - value: https://chat.ai.bstein.dev/ + value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL value: qwen2.5-coder:7b-instruct-q4_0 resources: From 83b8e13661df0b711c285737e6aabaca46c15983 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 25 Jan 2026 16:19:15 -0300 Subject: [PATCH 224/416] ai: restart ollama deployment --- services/ai-llm/deployment.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index fa35440..dfa1bdd 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -22,6 +22,7 @@ spec: annotations: ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0 ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24) + ai.bstein.dev/restartedAt: "2026-01-25T19:10:00Z" spec: affinity: nodeAffinity: From cd6eaff7cbf0e3fc8011683d42bc50607e357b2a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 00:52:35 -0300 Subject: [PATCH 225/416] comms: normalize atlasbot replies --- services/comms/atlasbot-deployment.yaml | 4 ++- services/comms/scripts/atlasbot/bot.py | 34 ++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 278a008..c2bc108 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-4 + checksum/atlasbot-configmap: manual-atlasbot-5 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -75,6 +75,8 @@ spec: value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 - name: BOT_USER value: atlasbot + - name: BOT_MENTIONS + value: atlasbot - name: OLLAMA_URL value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL diff --git 
a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e8bd1a8..3da93ba 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -71,6 +71,8 @@ METRIC_HINT_WORDS = { "latency", } +CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL) + def _tokens(text: str) -> list[str]: toks = [t.lower() for t in TOKEN_RE.findall(text or "")] return [t for t in toks if t not in STOPWORDS and len(t) >= 2] @@ -442,6 +444,35 @@ def vm_cluster_snapshot() -> str: parts.append(pr) return "\n".join(parts).strip() +def _strip_code_fence(text: str) -> str: + cleaned = (text or "").strip() + match = CODE_FENCE_RE.match(cleaned) + if match: + return match.group(1).strip() + return cleaned + +def _normalize_reply(value: Any) -> str: + if isinstance(value, dict): + for key in ("content", "response", "reply", "message"): + if key in value: + return _normalize_reply(value[key]) + for v in value.values(): + if isinstance(v, (str, dict, list)): + return _normalize_reply(v) + return json.dumps(value, ensure_ascii=False) + if isinstance(value, list): + parts = [_normalize_reply(item) for item in value] + return " ".join(p for p in parts if p) + if value is None: + return "" + text = _strip_code_fence(str(value)) + if text.startswith("{") and text.endswith("}"): + try: + return _normalize_reply(json.loads(text)) + except Exception: + return text + return text + # Conversation state. history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript) @@ -511,7 +542,8 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str: r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) with request.urlopen(r, timeout=20) as resp: data = json.loads(resp.read().decode()) - reply = data.get("message") or data.get("response") or data.get("reply") or "I'm here to help." + raw_reply = data.get("message") or data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." 
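# Aside, illustrative and not part of the committed hunk: _normalize_reply
# recursively unwraps dict/list payloads and re-parses JSON-looking strings,
# so the gateway may return {"message": ...}, a list, or a bare string and
# still yield plain text. One caveat in the hunk above: CODE_FENCE_RE is
# compiled from the raw string r"^```(?:json)?\\s*(.*?)\\s*```$", where
# "\\s" matches a literal backslash followed by "s" rather than whitespace,
# so fenced model replies may pass through _strip_code_fence unstripped.
# A corrected sketch (an assumption on my part, not the committed code):
import json
import re

FENCE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)

def strip_code_fence(text: str) -> str:
    """Drop a single surrounding ``` fence, if present."""
    cleaned = (text or "").strip()
    m = FENCE.match(cleaned)
    return m.group(1).strip() if m else cleaned

assert strip_code_fence('```json\n{"reply": "hi"}\n```') == '{"reply": "hi"}'
assert json.loads(strip_code_fence('```{"a": 1}```')) == {"a": 1}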
history[hist_key].append(f"Atlas: {reply}") return reply except Exception: From 5f0bc3832d86da5e8a034034ad1ebc02bee69cfa Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 01:07:49 -0300 Subject: [PATCH 226/416] comms: answer node count queries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 33 ++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c2bc108..7a258ac 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-5 + checksum/atlasbot-configmap: manual-atlasbot-6 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 3da93ba..69c1b84 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -444,6 +444,28 @@ def vm_cluster_snapshot() -> str: parts.append(pr) return "\n".join(parts).strip() +def nodes_summary(cluster_name: str) -> str: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return "" + items = data.get("items") or [] + if not isinstance(items, list) or not items: + return "" + total = len(items) + ready = 0 + for node in items: + conditions = node.get("status", {}).get("conditions") or [] + for cond in conditions if isinstance(conditions, list) else []: + if cond.get("type") == "Ready": + if cond.get("status") == "True": + ready += 1 + break + not_ready = max(total - ready, 0) + if not_ready: + return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." + return f"{cluster_name} cluster has {total} nodes, all Ready." + def _strip_code_fence(text: str) -> str: cleaned = (text or "").strip() match = CODE_FENCE_RE.match(cleaned) @@ -526,7 +548,8 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str: "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " "Prefer answering with exact repo paths and Kubernetes resource names. " - "Never include or request secret values." + "Never include or request secret values. " + "Respond in plain sentences; do not return JSON or code fences unless explicitly asked." ) transcript_parts = [system] if context: @@ -601,6 +624,14 @@ def sync_loop(token: str, room_id: str): if not (is_dm or mentioned): continue + lower_body = body.lower() + if re.search(r"\\bhow many nodes\\b|\\bnode count\\b|\\bnumber of nodes\\b", lower_body): + if any(word in lower_body for word in ("cluster", "atlas", "titan")): + summary = nodes_summary("Atlas") + if summary: + send_msg(token, rid, summary) + continue + # Only do live cluster/metrics introspection in DMs. 
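# Aside, illustrative and not part of the committed hunk: the matcher added
# in this patch never fires, because in a raw string r"\\b" is a literal
# backslash plus "b", not a word boundary — re.search() hunts for the two
# characters "\b" in the message text. PATCH 227 below rewrites the pattern
# with single backslashes. A quick demonstration of the difference:
import re

body = "how many nodes are in the atlas cluster?"
assert re.search(r"\\bhow many nodes\\b", body) is None      # pattern as committed here
assert re.search(r"\bhow many nodes\b", body) is not None    # pattern after PATCH 227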
allow_tools = is_dm From 36f7de76e97b30a4d89e84abb5c946a92c3f729a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 01:32:01 -0300 Subject: [PATCH 227/416] comms: fix atlasbot node count matcher --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7a258ac..fe1e906 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-6 + checksum/atlasbot-configmap: manual-atlasbot-7 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 69c1b84..b2ac1c9 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -625,12 +625,14 @@ def sync_loop(token: str, room_id: str): continue lower_body = body.lower() - if re.search(r"\\bhow many nodes\\b|\\bnode count\\b|\\bnumber of nodes\\b", lower_body): + if re.search(r"\bhow many nodes\b|\bnode count\b|\bnumber of nodes\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): summary = nodes_summary("Atlas") - if summary: - send_msg(token, rid, summary) + if not summary: + send_msg(token, rid, "I couldn’t reach the cluster API to count nodes. Try again in a moment.") continue + send_msg(token, rid, summary) + continue # Only do live cluster/metrics introspection in DMs. allow_tools = is_dm From 5aac018a7b85d2703c9ae7b2b8448148c529bddf Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 01:35:47 -0300 Subject: [PATCH 228/416] comms: answer node name queries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 29 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index fe1e906..7aedf4a 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-7 + checksum/atlasbot-configmap: manual-atlasbot-8 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index b2ac1c9..6fb6bff 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -466,6 +466,27 @@ def nodes_summary(cluster_name: str) -> str: return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." return f"{cluster_name} cluster has {total} nodes, all Ready." 
+def nodes_names_summary(cluster_name: str) -> str: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return "" + items = data.get("items") or [] + if not isinstance(items, list) or not items: + return "" + names = [] + for node in items: + name = (node.get("metadata") or {}).get("name") or "" + if name: + names.append(name) + names = sorted(set(names)) + if not names: + return "" + if len(names) <= 30: + return f"{cluster_name} node names: {', '.join(names)}." + shown = ", ".join(names[:30]) + return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)." + def _strip_code_fence(text: str) -> str: cleaned = (text or "").strip() match = CODE_FENCE_RE.match(cleaned) @@ -633,6 +654,14 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body): + if any(word in lower_body for word in ("cluster", "atlas", "titan")): + names_summary = nodes_names_summary("Atlas") + if not names_summary: + send_msg(token, rid, "I couldn’t reach the cluster API to list node names. Try again in a moment.") + continue + send_msg(token, rid, names_summary) + continue # Only do live cluster/metrics introspection in DMs. allow_tools = is_dm From 10003ca0d79841a1c75b81ce1821f7c6fec20ab1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 03:32:17 -0300 Subject: [PATCH 229/416] comms: sync atlas knowledge and use ariadne state --- knowledge/catalog/atlas-summary.json | 8 +- knowledge/catalog/atlas.json | 706 ++++++++++++++++-- knowledge/catalog/atlas.yaml | 494 ++++++++++-- knowledge/diagrams/atlas-http.mmd | 43 +- scripts/knowledge_render_atlas.py | 17 + services/comms/atlasbot-deployment.yaml | 4 +- .../knowledge/catalog/atlas-summary.json | 8 +- services/comms/knowledge/catalog/atlas.json | 706 ++++++++++++++++-- services/comms/knowledge/catalog/atlas.yaml | 496 ++++++++++-- .../comms/knowledge/catalog/runbooks.json | 16 + .../comms/knowledge/diagrams/atlas-http.mmd | 43 +- services/comms/knowledge/metis.md | 26 + .../comms/knowledge/runbooks/comms-verify.md | 30 + services/comms/knowledge/software/metis.md | 73 ++ services/comms/scripts/atlasbot/bot.py | 38 + services/maintenance/ariadne-deployment.yaml | 8 + services/maintenance/ariadne-rbac.yaml | 15 + 17 files changed, 2453 insertions(+), 278 deletions(-) create mode 100644 services/comms/knowledge/metis.md create mode 100644 services/comms/knowledge/runbooks/comms-verify.md create mode 100644 services/comms/knowledge/software/metis.md diff --git a/knowledge/catalog/atlas-summary.json b/knowledge/catalog/atlas-summary.json index fa35051..ea825ce 100644 --- a/knowledge/catalog/atlas-summary.json +++ b/knowledge/catalog/atlas-summary.json @@ -1,8 +1,8 @@ { "counts": { - "helmrelease_host_hints": 17, - "http_endpoints": 37, - "services": 43, - "workloads": 54 + "helmrelease_host_hints": 19, + "http_endpoints": 45, + "services": 47, + "workloads": 74 } } diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json index 18cb6b6..21ac407 100644 --- a/knowledge/catalog/atlas.json +++ b/knowledge/catalog/atlas.json @@ -11,6 +11,21 @@ "path": "services/bstein-dev-home", "targetNamespace": "bstein-dev-home" }, + { + "name": "bstein-dev-home-migrations", + "path": "services/bstein-dev-home/migrations", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "cert-manager", + "path": "infrastructure/cert-manager", + "targetNamespace": "cert-manager" + }, + { + "name": "cert-manager-cleanup", + "path": 
"infrastructure/cert-manager/cleanup", + "targetNamespace": "cert-manager" + }, { "name": "comms", "path": "services/comms", @@ -26,6 +41,11 @@ "path": "services/crypto", "targetNamespace": "crypto" }, + { + "name": "finance", + "path": "services/finance", + "targetNamespace": "finance" + }, { "name": "flux-system", "path": "clusters/atlas/flux-system", @@ -46,6 +66,11 @@ "path": "services/harbor", "targetNamespace": "harbor" }, + { + "name": "health", + "path": "services/health", + "targetNamespace": "health" + }, { "name": "helm", "path": "infrastructure/sources/helm", @@ -71,6 +96,16 @@ "path": "services/logging", "targetNamespace": null }, + { + "name": "longhorn", + "path": "infrastructure/longhorn/core", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-adopt", + "path": "infrastructure/longhorn/adopt", + "targetNamespace": "longhorn-system" + }, { "name": "longhorn-ui", "path": "infrastructure/longhorn/ui-ingress", @@ -161,11 +196,21 @@ "path": "infrastructure/vault-csi", "targetNamespace": "kube-system" }, + { + "name": "vault-injector", + "path": "infrastructure/vault-injector", + "targetNamespace": "vault" + }, { "name": "vaultwarden", "path": "services/vaultwarden", "targetNamespace": "vaultwarden" }, + { + "name": "wallet-monero-temp", + "path": "services/crypto/wallet-monero-temp", + "targetNamespace": "crypto" + }, { "name": "xmr-miner", "path": "services/crypto/xmr-miner", @@ -199,7 +244,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157" ] }, { @@ -215,7 +260,20 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-vault-sync", + "labels": { + "app": "bstein-dev-home-vault-sync" + }, + "serviceAccountName": "bstein-dev-home-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -225,7 +283,7 @@ "labels": { "app": "chat-ai-gateway" }, - "serviceAccountName": null, + "serviceAccountName": "bstein-dev-home", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -249,6 +307,19 @@ "python:3.11-slim" ] }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "comms-vault-sync", + "labels": { + "app": "comms-vault-sync" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "comms", @@ -256,7 +327,7 @@ "labels": { "app": "coturn" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -286,7 +357,7 @@ "labels": { "app": "livekit" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -301,12 +372,12 @@ "labels": { "app": "livekit-token-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, "images": [ - "ghcr.io/element-hq/lk-jwt-service:0.3.0" + "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0" ] }, { @@ -316,7 +387,7 @@ "labels": { "app": "matrix-authentication-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -331,7 +402,7 @@ "labels": { "app.kubernetes.io/name": 
"matrix-guest-register" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": {}, "images": [ "python:3.11-slim" @@ -365,6 +436,19 @@ "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "crypto-vault-sync", + "labels": { + "app": "crypto-vault-sync" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "crypto", @@ -372,7 +456,7 @@ "labels": { "app": "monero-p2pool" }, - "serviceAccountName": null, + "serviceAccountName": "crypto-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -395,6 +479,53 @@ "registry.bstein.dev/crypto/monerod:0.18.4.1" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "wallet-monero-temp", + "labels": { + "app": "wallet-monero-temp" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "actual-budget", + "labels": { + "app": "actual-budget" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "firefly", + "labels": { + "app": "firefly" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "fireflyiii/core:version-6.4.15" + ] + }, { "kind": "Deployment", "namespace": "flux-system", @@ -516,7 +647,7 @@ "labels": { "app": "gitea" }, - "serviceAccountName": null, + "serviceAccountName": "gitea-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -524,6 +655,36 @@ "gitea/gitea:1.23" ] }, + { + "kind": "Deployment", + "namespace": "harbor", + "name": "harbor-vault-sync", + "labels": { + "app": "harbor-vault-sync" + }, + "serviceAccountName": "harbor-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "health", + "name": "wger", + "labels": { + "app": "wger" + }, + "serviceAccountName": "health-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10", + "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5" + ] + }, { "kind": "Deployment", "namespace": "jellyfin", @@ -531,7 +692,7 @@ "labels": { "app": "jellyfin" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": {}, "images": [ "docker.io/jellyfin/jellyfin:10.11.5" @@ -544,14 +705,27 @@ "labels": { "app": "pegasus" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" }, "images": [ "alpine:3.20", - "registry.bstein.dev/streaming/pegasus:1.2.32" + "registry.bstein.dev/streaming/pegasus-vault:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus-vault-sync", + 
"labels": { + "app": "pegasus-vault-sync" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -570,6 +744,35 @@ "jenkins/jenkins:2.528.3-jdk21" ] }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins-vault-sync", + "labels": { + "app": "jenkins-vault-sync" + }, + "serviceAccountName": "jenkins-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "ntp-sync", + "labels": { + "app": "ntp-sync" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "public.ecr.aws/docker/library/busybox:1.36.1" + ] + }, { "kind": "DaemonSet", "namespace": "kube-system", @@ -636,6 +839,21 @@ "hashicorp/vault-csi-provider:1.7.0" ] }, + { + "kind": "Deployment", + "namespace": "kube-system", + "name": "coredns", + "labels": { + "k8s-app": "kube-dns" + }, + "serviceAccountName": "coredns", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "registry.bstein.dev/infra/coredns:1.12.1" + ] + }, { "kind": "DaemonSet", "namespace": "logging", @@ -681,6 +899,19 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "logging-vault-sync", + "labels": { + "app": "logging-vault-sync" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "logging", @@ -688,12 +919,27 @@ "labels": { "app": "oauth2-proxy-logs" }, - "serviceAccountName": null, + "serviceAccountName": "logging-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "longhorn-vault-sync", + "labels": { + "app": "longhorn-vault-sync" + }, + "serviceAccountName": "longhorn-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" ] }, { @@ -703,7 +949,7 @@ "labels": { "app": "oauth2-proxy-longhorn" }, - "serviceAccountName": null, + "serviceAccountName": "longhorn-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -729,14 +975,45 @@ { "kind": "Deployment", "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "name": "mailu-vault-sync", "labels": { - "app": "mailu-sync-listener" + "app": "mailu-vault-sync" }, - "serviceAccountName": null, + "serviceAccountName": "mailu-vault-sync", "nodeSelector": {}, "images": [ - "python:3.11-alpine" + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "disable-k3s-traefik", + "labels": { + "app": "disable-k3s-traefik" + }, + "serviceAccountName": "disable-k3s-traefik", + "nodeSelector": { + "node-role.kubernetes.io/control-plane": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "k3s-agent-restart", + "labels": { + "app": "k3s-agent-restart" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + 
"bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, { @@ -767,6 +1044,35 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "ariadne", + "labels": { + "app": "ariadne" + }, + "serviceAccountName": "ariadne", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/ariadne:0.1.0-48" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "maintenance-vault-sync", + "labels": { + "app": "maintenance-vault-sync" + }, + "serviceAccountName": "maintenance-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "DaemonSet", "namespace": "monitoring", @@ -795,6 +1101,19 @@ "python:3.10-slim" ] }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "monitoring-vault-sync", + "labels": { + "app": "monitoring-vault-sync" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "monitoring", @@ -802,7 +1121,7 @@ "labels": { "app": "postmark-exporter" }, - "serviceAccountName": null, + "serviceAccountName": "monitoring-vault-sync", "nodeSelector": {}, "images": [ "python:3.12-alpine" @@ -830,7 +1149,7 @@ "labels": { "app": "nextcloud" }, - "serviceAccountName": null, + "serviceAccountName": "nextcloud-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -845,7 +1164,7 @@ "labels": { "app": "outline" }, - "serviceAccountName": null, + "serviceAccountName": "outline-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -875,7 +1194,7 @@ "labels": { "app": "planka" }, - "serviceAccountName": null, + "serviceAccountName": "planka-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -895,7 +1214,8 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "postgres:15" + "postgres:15", + "quay.io/prometheuscommunity/postgres-exporter:v0.15.0" ] }, { @@ -905,8 +1225,11 @@ "labels": { "app": "keycloak" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "quay.io/keycloak/keycloak:26.0.7" ] @@ -918,12 +1241,25 @@ "labels": { "app": "oauth2-proxy" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "sso-vault-sync", + "labels": { + "app": "sso-vault-sync" + }, + "serviceAccountName": "sso-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -933,7 +1269,7 @@ "labels": { "app": "openldap" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -951,7 +1287,7 @@ }, "serviceAccountName": "sui-metrics", "nodeSelector": { - "kubernetes.io/hostname": "titan-24" + "hardware": "rpi5" }, "images": [ "victoriametrics/vmagent:v1.103.0" @@ -962,7 +1298,9 @@ "namespace": "traefik", "name": "traefik", "labels": { - "app": "traefik" + "app": "traefik", + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" }, 
"serviceAccountName": "traefik-ingress-controller", "nodeSelector": { @@ -995,8 +1333,11 @@ "labels": { "app": "vaultwarden" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "vaultwarden-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "vaultwarden/server:1.35.2" ] @@ -1565,6 +1906,54 @@ } ] }, + { + "namespace": "crypto", + "name": "wallet-monero-temp", + "type": "ClusterIP", + "selector": { + "app": "wallet-monero-temp" + }, + "ports": [ + { + "name": "rpc", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "actual-budget", + "type": "ClusterIP", + "selector": { + "app": "actual-budget" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 5006, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "firefly", + "type": "ClusterIP", + "selector": { + "app": "firefly" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, { "namespace": "flux-system", "name": "notification-controller", @@ -1632,7 +2021,7 @@ { "namespace": "gitea", "name": "gitea-ssh", - "type": "NodePort", + "type": "LoadBalancer", "selector": { "app": "gitea" }, @@ -1645,6 +2034,22 @@ } ] }, + { + "namespace": "health", + "name": "wger", + "type": "ClusterIP", + "selector": { + "app": "wger" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, { "namespace": "jellyfin", "name": "jellyfin", @@ -1699,29 +2104,6 @@ } ] }, - { - "namespace": "kube-system", - "name": "traefik", - "type": "LoadBalancer", - "selector": { - "app.kubernetes.io/instance": "traefik-kube-system", - "app.kubernetes.io/name": "traefik" - }, - "ports": [ - { - "name": "web", - "port": 80, - "targetPort": "web", - "protocol": "TCP" - }, - { - "name": "websecure", - "port": 443, - "targetPort": "websecure", - "protocol": "TCP" - } - ] - }, { "namespace": "logging", "name": "oauth2-proxy-logs", @@ -1803,17 +2185,17 @@ ] }, { - "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "namespace": "maintenance", + "name": "ariadne", "type": "ClusterIP", "selector": { - "app": "mailu-sync-listener" + "app": "ariadne" }, "ports": [ { "name": "http", - "port": 8080, - "targetPort": 8080, + "port": 80, + "targetPort": "http", "protocol": "TCP" } ] @@ -1959,6 +2341,12 @@ "port": 5432, "targetPort": 5432, "protocol": "TCP" + }, + { + "name": "metrics", + "port": 9187, + "targetPort": 9187, + "protocol": "TCP" } ] }, @@ -2032,6 +2420,28 @@ } ] }, + { + "namespace": "traefik", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, { "namespace": "traefik", "name": "traefik-metrics", @@ -2210,6 +2620,26 @@ "source": "bstein-dev-home" } }, + { + "host": "budget.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "actual-budget", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "actual-budget" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "actual-budget", + "source": "finance" + } + }, { "host": "call.live.bstein.dev", "path": "/", @@ -2290,6 +2720,26 @@ "source": "nextcloud" } }, + { + "host": "health.bstein.dev", + "path": "/", + "backend": { + "namespace": 
"health", + "service": "wger", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "wger" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "wger", + "source": "health" + } + }, { "host": "kit.live.bstein.dev", "path": "/livekit/jwt", @@ -2385,6 +2835,106 @@ "source": "comms" } }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, { "host": "logs.bstein.dev", "path": "/", @@ -2650,6 +3200,26 @@ "source": "monerod" } }, + { + "host": "money.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "firefly", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "firefly" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "firefly", + "source": "finance" + } + }, { "host": "notes.bstein.dev", "path": "/", @@ -2838,7 +3408,6 @@ "matrix.live.bstein.dev" ], "comms:comms/othrys-synapse": [ - "bstein.dev", "kit.live.bstein.dev", "live.bstein.dev", "matrix.live.bstein.dev", @@ -2853,6 +3422,9 @@ "logging:logging/data-prepper": [ "registry.bstein.dev" ], + "longhorn:longhorn-system/longhorn": [ + "registry.bstein.dev" + ], "mailu:mailu-mailserver/mailu": [ "bstein.dev", "mail.bstein.dev" @@ -2862,8 +3434,12 @@ ], "monitoring:monitoring/grafana": [ "bstein.dev", + "mail.bstein.dev", "metrics.bstein.dev", "sso.bstein.dev" + ], + "monitoring:monitoring/kube-state-metrics": [ + "atlas.bstein.dev" ] } } diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml index 580a331..b3b0119 100644 --- a/knowledge/catalog/atlas.yaml +++ b/knowledge/catalog/atlas.yaml @@ -8,6 +8,15 @@ sources: - name: bstein-dev-home path: services/bstein-dev-home targetNamespace: bstein-dev-home +- name: bstein-dev-home-migrations + path: services/bstein-dev-home/migrations + targetNamespace: bstein-dev-home +- name: cert-manager + path: 
infrastructure/cert-manager + targetNamespace: cert-manager +- name: cert-manager-cleanup + path: infrastructure/cert-manager/cleanup + targetNamespace: cert-manager - name: comms path: services/comms targetNamespace: comms @@ -17,6 +26,9 @@ sources: - name: crypto path: services/crypto targetNamespace: crypto +- name: finance + path: services/finance + targetNamespace: finance - name: flux-system path: clusters/atlas/flux-system targetNamespace: null @@ -29,6 +41,9 @@ sources: - name: harbor path: services/harbor targetNamespace: harbor +- name: health + path: services/health + targetNamespace: health - name: helm path: infrastructure/sources/helm targetNamespace: flux-system @@ -44,6 +59,12 @@ sources: - name: logging path: services/logging targetNamespace: null +- name: longhorn + path: infrastructure/longhorn/core + targetNamespace: longhorn-system +- name: longhorn-adopt + path: infrastructure/longhorn/adopt + targetNamespace: longhorn-system - name: longhorn-ui path: infrastructure/longhorn/ui-ingress targetNamespace: longhorn-system @@ -98,9 +119,15 @@ sources: - name: vault-csi path: infrastructure/vault-csi targetNamespace: kube-system +- name: vault-injector + path: infrastructure/vault-injector + targetNamespace: vault - name: vaultwarden path: services/vaultwarden targetNamespace: vaultwarden +- name: wallet-monero-temp + path: services/crypto/wallet-monero-temp + targetNamespace: crypto - name: xmr-miner path: services/crypto/xmr-miner targetNamespace: crypto @@ -124,7 +151,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 - kind: Deployment namespace: bstein-dev-home name: bstein-dev-home-frontend @@ -135,13 +162,22 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-vault-sync + labels: + app: bstein-dev-home-vault-sync + serviceAccountName: bstein-dev-home-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: bstein-dev-home name: chat-ai-gateway labels: app: chat-ai-gateway - serviceAccountName: null + serviceAccountName: bstein-dev-home nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -157,12 +193,21 @@ workloads: hardware: rpi5 images: - python:3.11-slim +- kind: Deployment + namespace: comms + name: comms-vault-sync + labels: + app: comms-vault-sync + serviceAccountName: comms-vault + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: comms name: coturn labels: app: coturn - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -182,7 +227,7 @@ workloads: name: livekit labels: app: livekit - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -192,17 +237,17 @@ workloads: name: livekit-token-service labels: app: livekit-token-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: - - ghcr.io/element-hq/lk-jwt-service:0.3.0 + - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0 - kind: Deployment namespace: comms name: matrix-authentication-service labels: app: matrix-authentication-service - serviceAccountName: null + serviceAccountName: 
comms-vault nodeSelector: hardware: rpi5 images: @@ -212,7 +257,7 @@ workloads: name: matrix-guest-register labels: app.kubernetes.io/name: matrix-guest-register - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: {} images: - python:3.11-slim @@ -235,12 +280,21 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 +- kind: Deployment + namespace: crypto + name: crypto-vault-sync + labels: + app: crypto-vault-sync + serviceAccountName: crypto-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: crypto name: monero-p2pool labels: app: monero-p2pool - serviceAccountName: null + serviceAccountName: crypto-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -255,6 +309,38 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: crypto + name: wallet-monero-temp + labels: + app: wallet-monero-temp + serviceAccountName: crypto-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1 +- kind: Deployment + namespace: finance + name: actual-budget + labels: + app: actual-budget + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d +- kind: Deployment + namespace: finance + name: firefly + labels: + app: firefly + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - fireflyiii/core:version-6.4.15 - kind: Deployment namespace: flux-system name: helm-controller @@ -344,17 +430,38 @@ workloads: name: gitea labels: app: gitea - serviceAccountName: null + serviceAccountName: gitea-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - gitea/gitea:1.23 +- kind: Deployment + namespace: harbor + name: harbor-vault-sync + labels: + app: harbor-vault-sync + serviceAccountName: harbor-vault-sync + nodeSelector: {} + images: + - alpine:3.20 +- kind: Deployment + namespace: health + name: wger + labels: + app: wger + serviceAccountName: health-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10 + - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5 - kind: Deployment namespace: jellyfin name: jellyfin labels: app: jellyfin - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: {} images: - docker.io/jellyfin/jellyfin:10.11.5 @@ -363,13 +470,22 @@ workloads: name: pegasus labels: app: pegasus - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - alpine:3.20 - - registry.bstein.dev/streaming/pegasus:1.2.32 + - registry.bstein.dev/streaming/pegasus-vault:1.2.32 +- kind: Deployment + namespace: jellyfin + name: pegasus-vault-sync + labels: + app: pegasus-vault-sync + serviceAccountName: pegasus-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: jenkins name: jenkins @@ -381,6 +497,26 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - 
jenkins/jenkins:2.528.3-jdk21 +- kind: Deployment + namespace: jenkins + name: jenkins-vault-sync + labels: + app: jenkins-vault-sync + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 +- kind: DaemonSet + namespace: kube-system + name: ntp-sync + labels: + app: ntp-sync + serviceAccountName: null + nodeSelector: {} + images: + - public.ecr.aws/docker/library/busybox:1.36.1 - kind: DaemonSet namespace: kube-system name: nvidia-device-plugin-jetson @@ -427,6 +563,16 @@ workloads: kubernetes.io/os: linux images: - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: kube-system + name: coredns + labels: + k8s-app: kube-dns + serviceAccountName: coredns + nodeSelector: + kubernetes.io/os: linux + images: + - registry.bstein.dev/infra/coredns:1.12.1 - kind: DaemonSet namespace: logging name: node-image-gc-rpi4 @@ -457,22 +603,41 @@ workloads: hardware: rpi5 images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: logging + name: logging-vault-sync + labels: + app: logging-vault-sync + serviceAccountName: logging-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: logging name: oauth2-proxy-logs labels: app: oauth2-proxy-logs - serviceAccountName: null + serviceAccountName: logging-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: longhorn-system + name: longhorn-vault-sync + labels: + app: longhorn-vault-sync + serviceAccountName: longhorn-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 - kind: Deployment namespace: longhorn-system name: oauth2-proxy-longhorn labels: app: oauth2-proxy-longhorn - serviceAccountName: null + serviceAccountName: longhorn-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -489,13 +654,34 @@ workloads: - registry.bstein.dev/bstein/kubectl:1.35.0 - kind: Deployment namespace: mailu-mailserver - name: mailu-sync-listener + name: mailu-vault-sync labels: - app: mailu-sync-listener - serviceAccountName: null + app: mailu-vault-sync + serviceAccountName: mailu-vault-sync nodeSelector: {} images: - - python:3.11-alpine + - alpine:3.20 +- kind: DaemonSet + namespace: maintenance + name: disable-k3s-traefik + labels: + app: disable-k3s-traefik + serviceAccountName: disable-k3s-traefik + nodeSelector: + node-role.kubernetes.io/control-plane: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: DaemonSet + namespace: maintenance + name: k3s-agent-restart + labels: + app: k3s-agent-restart + serviceAccountName: node-nofile + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 - kind: DaemonSet namespace: maintenance name: node-image-sweeper @@ -515,6 +701,26 @@ workloads: nodeSelector: {} images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: maintenance + name: ariadne + labels: + app: ariadne + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/ariadne:0.1.0-48 +- 
kind: Deployment + namespace: maintenance + name: maintenance-vault-sync + labels: + app: maintenance-vault-sync + serviceAccountName: maintenance-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: DaemonSet namespace: monitoring name: dcgm-exporter @@ -534,12 +740,21 @@ workloads: jetson: 'true' images: - python:3.10-slim +- kind: Deployment + namespace: monitoring + name: monitoring-vault-sync + labels: + app: monitoring-vault-sync + serviceAccountName: monitoring-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: monitoring name: postmark-exporter labels: app: postmark-exporter - serviceAccountName: null + serviceAccountName: monitoring-vault-sync nodeSelector: {} images: - python:3.12-alpine @@ -558,7 +773,7 @@ workloads: name: nextcloud labels: app: nextcloud - serviceAccountName: null + serviceAccountName: nextcloud-vault nodeSelector: hardware: rpi5 images: @@ -568,7 +783,7 @@ workloads: name: outline labels: app: outline - serviceAccountName: null + serviceAccountName: outline-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -588,7 +803,7 @@ workloads: name: planka labels: app: planka - serviceAccountName: null + serviceAccountName: planka-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -603,13 +818,16 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - postgres:15 + - quay.io/prometheuscommunity/postgres-exporter:v0.15.0 - kind: Deployment namespace: sso name: keycloak labels: app: keycloak - serviceAccountName: null - nodeSelector: {} + serviceAccountName: sso-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - quay.io/keycloak/keycloak:26.0.7 - kind: Deployment @@ -617,17 +835,26 @@ workloads: name: oauth2-proxy labels: app: oauth2-proxy - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: sso + name: sso-vault-sync + labels: + app: sso-vault-sync + serviceAccountName: sso-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: StatefulSet namespace: sso name: openldap labels: app: openldap - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -640,7 +867,7 @@ workloads: app: sui-metrics serviceAccountName: sui-metrics nodeSelector: - kubernetes.io/hostname: titan-24 + hardware: rpi5 images: - victoriametrics/vmagent:v1.103.0 - kind: Deployment @@ -648,6 +875,8 @@ workloads: name: traefik labels: app: traefik + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik serviceAccountName: traefik-ingress-controller nodeSelector: node-role.kubernetes.io/worker: 'true' @@ -669,8 +898,10 @@ workloads: name: vaultwarden labels: app: vaultwarden - serviceAccountName: null - nodeSelector: {} + serviceAccountName: vaultwarden-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - vaultwarden/server:1.35.2 services: @@ -1040,6 +1271,36 @@ services: port: 3333 targetPort: 3333 protocol: TCP +- namespace: crypto + name: wallet-monero-temp + type: ClusterIP + selector: + app: wallet-monero-temp + ports: + - name: rpc + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: finance + name: actual-budget + type: ClusterIP + selector: + app: actual-budget + ports: + - name: http 
+ port: 80 + targetPort: 5006 + protocol: TCP +- namespace: finance + name: firefly + type: ClusterIP + selector: + app: firefly + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP - namespace: flux-system name: notification-controller type: ClusterIP @@ -1082,7 +1343,7 @@ services: protocol: TCP - namespace: gitea name: gitea-ssh - type: NodePort + type: LoadBalancer selector: app: gitea ports: @@ -1090,6 +1351,16 @@ services: port: 2242 targetPort: 2242 protocol: TCP +- namespace: health + name: wger + type: ClusterIP + selector: + app: wger + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP - namespace: jellyfin name: jellyfin type: ClusterIP @@ -1124,21 +1395,6 @@ services: port: 50000 targetPort: 50000 protocol: TCP -- namespace: kube-system - name: traefik - type: LoadBalancer - selector: - app.kubernetes.io/instance: traefik-kube-system - app.kubernetes.io/name: traefik - ports: - - name: web - port: 80 - targetPort: web - protocol: TCP - - name: websecure - port: 443 - targetPort: websecure - protocol: TCP - namespace: logging name: oauth2-proxy-logs type: ClusterIP @@ -1191,15 +1447,15 @@ services: port: 4190 targetPort: 4190 protocol: TCP -- namespace: mailu-mailserver - name: mailu-sync-listener +- namespace: maintenance + name: ariadne type: ClusterIP selector: - app: mailu-sync-listener + app: ariadne ports: - name: http - port: 8080 - targetPort: 8080 + port: 80 + targetPort: http protocol: TCP - namespace: monitoring name: dcgm-exporter @@ -1291,6 +1547,10 @@ services: port: 5432 targetPort: 5432 protocol: TCP + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP - namespace: sso name: keycloak type: ClusterIP @@ -1335,6 +1595,20 @@ services: port: 8429 targetPort: 8429 protocol: TCP +- namespace: traefik + name: traefik + type: LoadBalancer + selector: + app: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP - namespace: traefik name: traefik-metrics type: ClusterIP @@ -1447,6 +1721,19 @@ http_endpoints: kind: Ingress name: bstein-dev-home source: bstein-dev-home +- host: budget.bstein.dev + path: / + backend: + namespace: finance + service: actual-budget + port: 80 + workloads: + - kind: Deployment + name: actual-budget + via: + kind: Ingress + name: actual-budget + source: finance - host: call.live.bstein.dev path: / backend: @@ -1499,6 +1786,19 @@ http_endpoints: kind: Ingress name: nextcloud source: nextcloud +- host: health.bstein.dev + path: / + backend: + namespace: health + service: wger + port: 80 + workloads: + - kind: Deployment + name: wger + via: + kind: Ingress + name: wger + source: health - host: kit.live.bstein.dev path: /livekit/jwt backend: @@ -1558,6 +1858,65 @@ http_endpoints: kind: Ingress name: matrix-routing source: comms +- host: live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id002 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + 
workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: comms - host: logs.bstein.dev path: / backend: @@ -1601,9 +1960,7 @@ http_endpoints: namespace: comms service: matrix-authentication-service port: 8080 - workloads: &id002 - - kind: Deployment - name: matrix-authentication-service + workloads: *id002 via: kind: Ingress name: matrix-routing @@ -1647,9 +2004,7 @@ http_endpoints: namespace: comms service: matrix-guest-register port: 8080 - workloads: &id003 - - kind: Deployment - name: matrix-guest-register + workloads: *id003 via: kind: Ingress name: matrix-routing @@ -1722,6 +2077,19 @@ http_endpoints: kind: Ingress name: monerod source: monerod +- host: money.bstein.dev + path: / + backend: + namespace: finance + service: firefly + port: 80 + workloads: + - kind: Deployment + name: firefly + via: + kind: Ingress + name: firefly + source: finance - host: notes.bstein.dev path: / backend: @@ -1845,7 +2213,6 @@ helmrelease_host_hints: - live.bstein.dev - matrix.live.bstein.dev comms:comms/othrys-synapse: - - bstein.dev - kit.live.bstein.dev - live.bstein.dev - matrix.live.bstein.dev @@ -1856,6 +2223,8 @@ helmrelease_host_hints: - registry.bstein.dev logging:logging/data-prepper: - registry.bstein.dev + longhorn:longhorn-system/longhorn: + - registry.bstein.dev mailu:mailu-mailserver/mailu: - bstein.dev - mail.bstein.dev @@ -1863,5 +2232,8 @@ helmrelease_host_hints: - alerts.bstein.dev monitoring:monitoring/grafana: - bstein.dev + - mail.bstein.dev - metrics.bstein.dev - sso.bstein.dev + monitoring:monitoring/kube-state-metrics: + - atlas.bstein.dev diff --git a/knowledge/diagrams/atlas-http.mmd b/knowledge/diagrams/atlas-http.mmd index ab7c362..1aa7ac8 100644 --- a/knowledge/diagrams/atlas-http.mmd +++ b/knowledge/diagrams/atlas-http.mmd @@ -17,6 +17,11 @@ flowchart LR host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_budget_bstein_dev["budget.bstein.dev"] + svc_finance_actual_budget["finance/actual-budget (Service)"] + host_budget_bstein_dev --> svc_finance_actual_budget + wl_finance_actual_budget["finance/actual-budget (Deployment)"] + svc_finance_actual_budget --> wl_finance_actual_budget host_call_live_bstein_dev["call.live.bstein.dev"] svc_comms_element_call["comms/element-call (Service)"] host_call_live_bstein_dev --> svc_comms_element_call @@ -37,6 +42,11 @@ flowchart LR host_cloud_bstein_dev --> svc_nextcloud_nextcloud wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_health_bstein_dev["health.bstein.dev"] + svc_health_wger["health/wger (Service)"] + host_health_bstein_dev --> svc_health_wger + wl_health_wger["health/wger (Deployment)"] + svc_health_wger --> wl_health_wger host_kit_live_bstein_dev["kit.live.bstein.dev"] svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] host_kit_live_bstein_dev --> svc_comms_livekit_token_service 
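The atlas-http.mmd hunks above and below follow a consistent id scheme: each
host, service, or workload becomes a Mermaid node whose id is the lower-cased
name with every non-alphanumeric run collapsed to an underscore, under a
host_/svc_/wl_ prefix. The exact helper in scripts/knowledge_render_atlas.py
is not shown in this patch; a plausible sketch of the mapping, with
sanitize_id as an assumed name, is:

import re

def sanitize_id(prefix: str, name: str) -> str:
    # e.g. "budget.bstein.dev" -> "host_budget_bstein_dev",
    #      "finance/actual-budget" -> "svc_finance_actual_budget"
    return prefix + re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")

assert sanitize_id("host_", "budget.bstein.dev") == "host_budget_bstein_dev"
assert sanitize_id("svc_", "finance/actual-budget") == "svc_finance_actual_budget"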
@@ -50,6 +60,14 @@ flowchart LR host_live_bstein_dev --> svc_comms_matrix_wellknown svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_logs_bstein_dev["logs.bstein.dev"] svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"] host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs @@ -64,21 +82,20 @@ flowchart LR svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front host_matrix_live_bstein_dev["matrix.live.bstein.dev"] - svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] - svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register - wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] - svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register host_monero_bstein_dev["monero.bstein.dev"] svc_crypto_monerod["crypto/monerod (Service)"] host_monero_bstein_dev --> svc_crypto_monerod wl_crypto_monerod["crypto/monerod (Deployment)"] svc_crypto_monerod --> wl_crypto_monerod + host_money_bstein_dev["money.bstein.dev"] + svc_finance_firefly["finance/firefly (Service)"] + host_money_bstein_dev --> svc_finance_firefly + wl_finance_firefly["finance/firefly (Deployment)"] + svc_finance_firefly --> wl_finance_firefly host_notes_bstein_dev["notes.bstein.dev"] svc_outline_outline["outline/outline (Service)"] host_notes_bstein_dev --> svc_outline_outline @@ -143,19 +160,29 @@ flowchart LR svc_comms_livekit wl_comms_livekit svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service svc_comms_matrix_guest_register wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service end subgraph crypto[crypto] svc_crypto_monerod wl_crypto_monerod end + subgraph finance[finance] + svc_finance_actual_budget + wl_finance_actual_budget + svc_finance_firefly + wl_finance_firefly + end subgraph gitea[gitea] svc_gitea_gitea wl_gitea_gitea end + subgraph health[health] + svc_health_wger + wl_health_wger + end subgraph jellyfin[jellyfin] svc_jellyfin_pegasus wl_jellyfin_pegasus diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index c7f9f26..34938e7 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -20,6 +20,7 @@ import 
subprocess import sys from dataclasses import dataclass from pathlib import Path +import shutil from typing import Any, Iterable import yaml @@ -60,6 +61,12 @@ def _run(cmd: list[str], *, cwd: Path) -> str: return res.stdout +def _sync_tree(source: Path, dest: Path) -> None: + if dest.exists(): + shutil.rmtree(dest) + shutil.copytree(source, dest) + + def kustomize_build(path: Path) -> str: rel = path.relative_to(REPO_ROOT) try: @@ -472,6 +479,11 @@ def main() -> int: action="store_true", help="Write generated files (otherwise just print a summary).", ) + ap.add_argument( + "--sync-comms", + action="store_true", + help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.", + ) args = ap.parse_args() out_dir = REPO_ROOT / args.out @@ -549,6 +561,11 @@ def main() -> int: print(f"Wrote {summary_path.relative_to(REPO_ROOT)}") print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}") print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") + + if args.sync_comms: + comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" + _sync_tree(out_dir, comms_dir) + print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}") return 0 diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7aedf4a..70844eb 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-8 + checksum/atlasbot-configmap: manual-atlasbot-9 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -73,6 +73,8 @@ spec: value: /kb - name: VM_URL value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 + - name: ARIADNE_STATE_URL + value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state - name: BOT_USER value: atlasbot - name: BOT_MENTIONS diff --git a/services/comms/knowledge/catalog/atlas-summary.json b/services/comms/knowledge/catalog/atlas-summary.json index fa35051..ea825ce 100644 --- a/services/comms/knowledge/catalog/atlas-summary.json +++ b/services/comms/knowledge/catalog/atlas-summary.json @@ -1,8 +1,8 @@ { "counts": { - "helmrelease_host_hints": 17, - "http_endpoints": 37, - "services": 43, - "workloads": 54 + "helmrelease_host_hints": 19, + "http_endpoints": 45, + "services": 47, + "workloads": 74 } } diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json index 18cb6b6..21ac407 100644 --- a/services/comms/knowledge/catalog/atlas.json +++ b/services/comms/knowledge/catalog/atlas.json @@ -11,6 +11,21 @@ "path": "services/bstein-dev-home", "targetNamespace": "bstein-dev-home" }, + { + "name": "bstein-dev-home-migrations", + "path": "services/bstein-dev-home/migrations", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "cert-manager", + "path": "infrastructure/cert-manager", + "targetNamespace": "cert-manager" + }, + { + "name": "cert-manager-cleanup", + "path": "infrastructure/cert-manager/cleanup", + "targetNamespace": "cert-manager" + }, { "name": "comms", "path": "services/comms", @@ -26,6 +41,11 @@ "path": "services/crypto", "targetNamespace": "crypto" }, + { + "name": "finance", + "path": "services/finance", + "targetNamespace": "finance" + }, { "name": "flux-system", "path": "clusters/atlas/flux-system", @@ -46,6 +66,11 @@ "path": "services/harbor", "targetNamespace": 
"harbor" }, + { + "name": "health", + "path": "services/health", + "targetNamespace": "health" + }, { "name": "helm", "path": "infrastructure/sources/helm", @@ -71,6 +96,16 @@ "path": "services/logging", "targetNamespace": null }, + { + "name": "longhorn", + "path": "infrastructure/longhorn/core", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-adopt", + "path": "infrastructure/longhorn/adopt", + "targetNamespace": "longhorn-system" + }, { "name": "longhorn-ui", "path": "infrastructure/longhorn/ui-ingress", @@ -161,11 +196,21 @@ "path": "infrastructure/vault-csi", "targetNamespace": "kube-system" }, + { + "name": "vault-injector", + "path": "infrastructure/vault-injector", + "targetNamespace": "vault" + }, { "name": "vaultwarden", "path": "services/vaultwarden", "targetNamespace": "vaultwarden" }, + { + "name": "wallet-monero-temp", + "path": "services/crypto/wallet-monero-temp", + "targetNamespace": "crypto" + }, { "name": "xmr-miner", "path": "services/crypto/xmr-miner", @@ -199,7 +244,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157" ] }, { @@ -215,7 +260,20 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-vault-sync", + "labels": { + "app": "bstein-dev-home-vault-sync" + }, + "serviceAccountName": "bstein-dev-home-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -225,7 +283,7 @@ "labels": { "app": "chat-ai-gateway" }, - "serviceAccountName": null, + "serviceAccountName": "bstein-dev-home", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -249,6 +307,19 @@ "python:3.11-slim" ] }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "comms-vault-sync", + "labels": { + "app": "comms-vault-sync" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "comms", @@ -256,7 +327,7 @@ "labels": { "app": "coturn" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -286,7 +357,7 @@ "labels": { "app": "livekit" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -301,12 +372,12 @@ "labels": { "app": "livekit-token-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, "images": [ - "ghcr.io/element-hq/lk-jwt-service:0.3.0" + "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0" ] }, { @@ -316,7 +387,7 @@ "labels": { "app": "matrix-authentication-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -331,7 +402,7 @@ "labels": { "app.kubernetes.io/name": "matrix-guest-register" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": {}, "images": [ "python:3.11-slim" @@ -365,6 +436,19 @@ "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "crypto-vault-sync", + "labels": { + "app": "crypto-vault-sync" + }, + "serviceAccountName": 
"crypto-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "crypto", @@ -372,7 +456,7 @@ "labels": { "app": "monero-p2pool" }, - "serviceAccountName": null, + "serviceAccountName": "crypto-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -395,6 +479,53 @@ "registry.bstein.dev/crypto/monerod:0.18.4.1" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "wallet-monero-temp", + "labels": { + "app": "wallet-monero-temp" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "actual-budget", + "labels": { + "app": "actual-budget" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "firefly", + "labels": { + "app": "firefly" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "fireflyiii/core:version-6.4.15" + ] + }, { "kind": "Deployment", "namespace": "flux-system", @@ -516,7 +647,7 @@ "labels": { "app": "gitea" }, - "serviceAccountName": null, + "serviceAccountName": "gitea-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -524,6 +655,36 @@ "gitea/gitea:1.23" ] }, + { + "kind": "Deployment", + "namespace": "harbor", + "name": "harbor-vault-sync", + "labels": { + "app": "harbor-vault-sync" + }, + "serviceAccountName": "harbor-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "health", + "name": "wger", + "labels": { + "app": "wger" + }, + "serviceAccountName": "health-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10", + "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5" + ] + }, { "kind": "Deployment", "namespace": "jellyfin", @@ -531,7 +692,7 @@ "labels": { "app": "jellyfin" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": {}, "images": [ "docker.io/jellyfin/jellyfin:10.11.5" @@ -544,14 +705,27 @@ "labels": { "app": "pegasus" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" }, "images": [ "alpine:3.20", - "registry.bstein.dev/streaming/pegasus:1.2.32" + "registry.bstein.dev/streaming/pegasus-vault:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus-vault-sync", + "labels": { + "app": "pegasus-vault-sync" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -570,6 +744,35 @@ "jenkins/jenkins:2.528.3-jdk21" ] }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins-vault-sync", + "labels": { + "app": "jenkins-vault-sync" + }, + "serviceAccountName": "jenkins-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": 
"arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "ntp-sync", + "labels": { + "app": "ntp-sync" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "public.ecr.aws/docker/library/busybox:1.36.1" + ] + }, { "kind": "DaemonSet", "namespace": "kube-system", @@ -636,6 +839,21 @@ "hashicorp/vault-csi-provider:1.7.0" ] }, + { + "kind": "Deployment", + "namespace": "kube-system", + "name": "coredns", + "labels": { + "k8s-app": "kube-dns" + }, + "serviceAccountName": "coredns", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "registry.bstein.dev/infra/coredns:1.12.1" + ] + }, { "kind": "DaemonSet", "namespace": "logging", @@ -681,6 +899,19 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "logging-vault-sync", + "labels": { + "app": "logging-vault-sync" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "logging", @@ -688,12 +919,27 @@ "labels": { "app": "oauth2-proxy-logs" }, - "serviceAccountName": null, + "serviceAccountName": "logging-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "longhorn-vault-sync", + "labels": { + "app": "longhorn-vault-sync" + }, + "serviceAccountName": "longhorn-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" ] }, { @@ -703,7 +949,7 @@ "labels": { "app": "oauth2-proxy-longhorn" }, - "serviceAccountName": null, + "serviceAccountName": "longhorn-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -729,14 +975,45 @@ { "kind": "Deployment", "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "name": "mailu-vault-sync", "labels": { - "app": "mailu-sync-listener" + "app": "mailu-vault-sync" }, - "serviceAccountName": null, + "serviceAccountName": "mailu-vault-sync", "nodeSelector": {}, "images": [ - "python:3.11-alpine" + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "disable-k3s-traefik", + "labels": { + "app": "disable-k3s-traefik" + }, + "serviceAccountName": "disable-k3s-traefik", + "nodeSelector": { + "node-role.kubernetes.io/control-plane": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "k3s-agent-restart", + "labels": { + "app": "k3s-agent-restart" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, { @@ -767,6 +1044,35 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "ariadne", + "labels": { + "app": "ariadne" + }, + "serviceAccountName": "ariadne", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + 
"registry.bstein.dev/bstein/ariadne:0.1.0-48" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "maintenance-vault-sync", + "labels": { + "app": "maintenance-vault-sync" + }, + "serviceAccountName": "maintenance-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "DaemonSet", "namespace": "monitoring", @@ -795,6 +1101,19 @@ "python:3.10-slim" ] }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "monitoring-vault-sync", + "labels": { + "app": "monitoring-vault-sync" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "monitoring", @@ -802,7 +1121,7 @@ "labels": { "app": "postmark-exporter" }, - "serviceAccountName": null, + "serviceAccountName": "monitoring-vault-sync", "nodeSelector": {}, "images": [ "python:3.12-alpine" @@ -830,7 +1149,7 @@ "labels": { "app": "nextcloud" }, - "serviceAccountName": null, + "serviceAccountName": "nextcloud-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -845,7 +1164,7 @@ "labels": { "app": "outline" }, - "serviceAccountName": null, + "serviceAccountName": "outline-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -875,7 +1194,7 @@ "labels": { "app": "planka" }, - "serviceAccountName": null, + "serviceAccountName": "planka-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -895,7 +1214,8 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "postgres:15" + "postgres:15", + "quay.io/prometheuscommunity/postgres-exporter:v0.15.0" ] }, { @@ -905,8 +1225,11 @@ "labels": { "app": "keycloak" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "quay.io/keycloak/keycloak:26.0.7" ] @@ -918,12 +1241,25 @@ "labels": { "app": "oauth2-proxy" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "sso-vault-sync", + "labels": { + "app": "sso-vault-sync" + }, + "serviceAccountName": "sso-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -933,7 +1269,7 @@ "labels": { "app": "openldap" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -951,7 +1287,7 @@ }, "serviceAccountName": "sui-metrics", "nodeSelector": { - "kubernetes.io/hostname": "titan-24" + "hardware": "rpi5" }, "images": [ "victoriametrics/vmagent:v1.103.0" @@ -962,7 +1298,9 @@ "namespace": "traefik", "name": "traefik", "labels": { - "app": "traefik" + "app": "traefik", + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" }, "serviceAccountName": "traefik-ingress-controller", "nodeSelector": { @@ -995,8 +1333,11 @@ "labels": { "app": "vaultwarden" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "vaultwarden-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "vaultwarden/server:1.35.2" ] @@ -1565,6 +1906,54 @@ } ] }, + { + "namespace": "crypto", + "name": "wallet-monero-temp", + "type": "ClusterIP", + 
"selector": { + "app": "wallet-monero-temp" + }, + "ports": [ + { + "name": "rpc", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "actual-budget", + "type": "ClusterIP", + "selector": { + "app": "actual-budget" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 5006, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "firefly", + "type": "ClusterIP", + "selector": { + "app": "firefly" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, { "namespace": "flux-system", "name": "notification-controller", @@ -1632,7 +2021,7 @@ { "namespace": "gitea", "name": "gitea-ssh", - "type": "NodePort", + "type": "LoadBalancer", "selector": { "app": "gitea" }, @@ -1645,6 +2034,22 @@ } ] }, + { + "namespace": "health", + "name": "wger", + "type": "ClusterIP", + "selector": { + "app": "wger" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, { "namespace": "jellyfin", "name": "jellyfin", @@ -1699,29 +2104,6 @@ } ] }, - { - "namespace": "kube-system", - "name": "traefik", - "type": "LoadBalancer", - "selector": { - "app.kubernetes.io/instance": "traefik-kube-system", - "app.kubernetes.io/name": "traefik" - }, - "ports": [ - { - "name": "web", - "port": 80, - "targetPort": "web", - "protocol": "TCP" - }, - { - "name": "websecure", - "port": 443, - "targetPort": "websecure", - "protocol": "TCP" - } - ] - }, { "namespace": "logging", "name": "oauth2-proxy-logs", @@ -1803,17 +2185,17 @@ ] }, { - "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "namespace": "maintenance", + "name": "ariadne", "type": "ClusterIP", "selector": { - "app": "mailu-sync-listener" + "app": "ariadne" }, "ports": [ { "name": "http", - "port": 8080, - "targetPort": 8080, + "port": 80, + "targetPort": "http", "protocol": "TCP" } ] @@ -1959,6 +2341,12 @@ "port": 5432, "targetPort": 5432, "protocol": "TCP" + }, + { + "name": "metrics", + "port": 9187, + "targetPort": 9187, + "protocol": "TCP" } ] }, @@ -2032,6 +2420,28 @@ } ] }, + { + "namespace": "traefik", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, { "namespace": "traefik", "name": "traefik-metrics", @@ -2210,6 +2620,26 @@ "source": "bstein-dev-home" } }, + { + "host": "budget.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "actual-budget", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "actual-budget" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "actual-budget", + "source": "finance" + } + }, { "host": "call.live.bstein.dev", "path": "/", @@ -2290,6 +2720,26 @@ "source": "nextcloud" } }, + { + "host": "health.bstein.dev", + "path": "/", + "backend": { + "namespace": "health", + "service": "wger", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "wger" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "wger", + "source": "health" + } + }, { "host": "kit.live.bstein.dev", "path": "/livekit/jwt", @@ -2385,6 +2835,106 @@ "source": "comms" } }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { 
+ "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, { "host": "logs.bstein.dev", "path": "/", @@ -2650,6 +3200,26 @@ "source": "monerod" } }, + { + "host": "money.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "firefly", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "firefly" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "firefly", + "source": "finance" + } + }, { "host": "notes.bstein.dev", "path": "/", @@ -2838,7 +3408,6 @@ "matrix.live.bstein.dev" ], "comms:comms/othrys-synapse": [ - "bstein.dev", "kit.live.bstein.dev", "live.bstein.dev", "matrix.live.bstein.dev", @@ -2853,6 +3422,9 @@ "logging:logging/data-prepper": [ "registry.bstein.dev" ], + "longhorn:longhorn-system/longhorn": [ + "registry.bstein.dev" + ], "mailu:mailu-mailserver/mailu": [ "bstein.dev", "mail.bstein.dev" @@ -2862,8 +3434,12 @@ ], "monitoring:monitoring/grafana": [ "bstein.dev", + "mail.bstein.dev", "metrics.bstein.dev", "sso.bstein.dev" + ], + "monitoring:monitoring/kube-state-metrics": [ + "atlas.bstein.dev" ] } } diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml index 67f2fcb..b3b0119 100644 --- a/services/comms/knowledge/catalog/atlas.yaml +++ b/services/comms/knowledge/catalog/atlas.yaml @@ -1,4 +1,4 @@ -# services/comms/knowledge/catalog/atlas.yaml +# knowledge/catalog/atlas.yaml # Generated by scripts/knowledge_render_atlas.py (do not edit by hand) cluster: atlas sources: @@ -8,6 +8,15 @@ sources: - name: bstein-dev-home path: services/bstein-dev-home targetNamespace: bstein-dev-home +- name: bstein-dev-home-migrations + path: services/bstein-dev-home/migrations + targetNamespace: bstein-dev-home +- name: cert-manager + path: infrastructure/cert-manager + targetNamespace: cert-manager +- name: cert-manager-cleanup + path: infrastructure/cert-manager/cleanup + targetNamespace: cert-manager - name: comms path: services/comms targetNamespace: comms @@ -17,6 +26,9 @@ sources: - 
name: crypto path: services/crypto targetNamespace: crypto +- name: finance + path: services/finance + targetNamespace: finance - name: flux-system path: clusters/atlas/flux-system targetNamespace: null @@ -29,6 +41,9 @@ sources: - name: harbor path: services/harbor targetNamespace: harbor +- name: health + path: services/health + targetNamespace: health - name: helm path: infrastructure/sources/helm targetNamespace: flux-system @@ -44,6 +59,12 @@ sources: - name: logging path: services/logging targetNamespace: null +- name: longhorn + path: infrastructure/longhorn/core + targetNamespace: longhorn-system +- name: longhorn-adopt + path: infrastructure/longhorn/adopt + targetNamespace: longhorn-system - name: longhorn-ui path: infrastructure/longhorn/ui-ingress targetNamespace: longhorn-system @@ -98,9 +119,15 @@ sources: - name: vault-csi path: infrastructure/vault-csi targetNamespace: kube-system +- name: vault-injector + path: infrastructure/vault-injector + targetNamespace: vault - name: vaultwarden path: services/vaultwarden targetNamespace: vaultwarden +- name: wallet-monero-temp + path: services/crypto/wallet-monero-temp + targetNamespace: crypto - name: xmr-miner path: services/crypto/xmr-miner targetNamespace: crypto @@ -124,7 +151,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 - kind: Deployment namespace: bstein-dev-home name: bstein-dev-home-frontend @@ -135,13 +162,22 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-vault-sync + labels: + app: bstein-dev-home-vault-sync + serviceAccountName: bstein-dev-home-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: bstein-dev-home name: chat-ai-gateway labels: app: chat-ai-gateway - serviceAccountName: null + serviceAccountName: bstein-dev-home nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -157,12 +193,21 @@ workloads: hardware: rpi5 images: - python:3.11-slim +- kind: Deployment + namespace: comms + name: comms-vault-sync + labels: + app: comms-vault-sync + serviceAccountName: comms-vault + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: comms name: coturn labels: app: coturn - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -182,7 +227,7 @@ workloads: name: livekit labels: app: livekit - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -192,17 +237,17 @@ workloads: name: livekit-token-service labels: app: livekit-token-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: - - ghcr.io/element-hq/lk-jwt-service:0.3.0 + - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0 - kind: Deployment namespace: comms name: matrix-authentication-service labels: app: matrix-authentication-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -212,7 +257,7 @@ workloads: name: matrix-guest-register labels: app.kubernetes.io/name: matrix-guest-register - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: {} images: - 
python:3.11-slim @@ -235,12 +280,21 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 +- kind: Deployment + namespace: crypto + name: crypto-vault-sync + labels: + app: crypto-vault-sync + serviceAccountName: crypto-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: crypto name: monero-p2pool labels: app: monero-p2pool - serviceAccountName: null + serviceAccountName: crypto-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -255,6 +309,38 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: crypto + name: wallet-monero-temp + labels: + app: wallet-monero-temp + serviceAccountName: crypto-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1 +- kind: Deployment + namespace: finance + name: actual-budget + labels: + app: actual-budget + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d +- kind: Deployment + namespace: finance + name: firefly + labels: + app: firefly + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - fireflyiii/core:version-6.4.15 - kind: Deployment namespace: flux-system name: helm-controller @@ -344,17 +430,38 @@ workloads: name: gitea labels: app: gitea - serviceAccountName: null + serviceAccountName: gitea-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - gitea/gitea:1.23 +- kind: Deployment + namespace: harbor + name: harbor-vault-sync + labels: + app: harbor-vault-sync + serviceAccountName: harbor-vault-sync + nodeSelector: {} + images: + - alpine:3.20 +- kind: Deployment + namespace: health + name: wger + labels: + app: wger + serviceAccountName: health-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10 + - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5 - kind: Deployment namespace: jellyfin name: jellyfin labels: app: jellyfin - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: {} images: - docker.io/jellyfin/jellyfin:10.11.5 @@ -363,13 +470,22 @@ workloads: name: pegasus labels: app: pegasus - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - alpine:3.20 - - registry.bstein.dev/streaming/pegasus:1.2.32 + - registry.bstein.dev/streaming/pegasus-vault:1.2.32 +- kind: Deployment + namespace: jellyfin + name: pegasus-vault-sync + labels: + app: pegasus-vault-sync + serviceAccountName: pegasus-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: jenkins name: jenkins @@ -381,6 +497,26 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - jenkins/jenkins:2.528.3-jdk21 +- kind: Deployment + namespace: jenkins + name: jenkins-vault-sync + labels: + app: jenkins-vault-sync + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 
'true' + images: + - alpine:3.20 +- kind: DaemonSet + namespace: kube-system + name: ntp-sync + labels: + app: ntp-sync + serviceAccountName: null + nodeSelector: {} + images: + - public.ecr.aws/docker/library/busybox:1.36.1 - kind: DaemonSet namespace: kube-system name: nvidia-device-plugin-jetson @@ -427,6 +563,16 @@ workloads: kubernetes.io/os: linux images: - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: kube-system + name: coredns + labels: + k8s-app: kube-dns + serviceAccountName: coredns + nodeSelector: + kubernetes.io/os: linux + images: + - registry.bstein.dev/infra/coredns:1.12.1 - kind: DaemonSet namespace: logging name: node-image-gc-rpi4 @@ -457,22 +603,41 @@ workloads: hardware: rpi5 images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: logging + name: logging-vault-sync + labels: + app: logging-vault-sync + serviceAccountName: logging-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: logging name: oauth2-proxy-logs labels: app: oauth2-proxy-logs - serviceAccountName: null + serviceAccountName: logging-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: longhorn-system + name: longhorn-vault-sync + labels: + app: longhorn-vault-sync + serviceAccountName: longhorn-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 - kind: Deployment namespace: longhorn-system name: oauth2-proxy-longhorn labels: app: oauth2-proxy-longhorn - serviceAccountName: null + serviceAccountName: longhorn-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -489,13 +654,34 @@ workloads: - registry.bstein.dev/bstein/kubectl:1.35.0 - kind: Deployment namespace: mailu-mailserver - name: mailu-sync-listener + name: mailu-vault-sync labels: - app: mailu-sync-listener - serviceAccountName: null + app: mailu-vault-sync + serviceAccountName: mailu-vault-sync nodeSelector: {} images: - - python:3.11-alpine + - alpine:3.20 +- kind: DaemonSet + namespace: maintenance + name: disable-k3s-traefik + labels: + app: disable-k3s-traefik + serviceAccountName: disable-k3s-traefik + nodeSelector: + node-role.kubernetes.io/control-plane: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: DaemonSet + namespace: maintenance + name: k3s-agent-restart + labels: + app: k3s-agent-restart + serviceAccountName: node-nofile + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 - kind: DaemonSet namespace: maintenance name: node-image-sweeper @@ -515,6 +701,26 @@ workloads: nodeSelector: {} images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: maintenance + name: ariadne + labels: + app: ariadne + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/ariadne:0.1.0-48 +- kind: Deployment + namespace: maintenance + name: maintenance-vault-sync + labels: + app: maintenance-vault-sync + serviceAccountName: maintenance-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: DaemonSet namespace: monitoring name: 
dcgm-exporter @@ -534,12 +740,21 @@ workloads: jetson: 'true' images: - python:3.10-slim +- kind: Deployment + namespace: monitoring + name: monitoring-vault-sync + labels: + app: monitoring-vault-sync + serviceAccountName: monitoring-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: monitoring name: postmark-exporter labels: app: postmark-exporter - serviceAccountName: null + serviceAccountName: monitoring-vault-sync nodeSelector: {} images: - python:3.12-alpine @@ -558,7 +773,7 @@ workloads: name: nextcloud labels: app: nextcloud - serviceAccountName: null + serviceAccountName: nextcloud-vault nodeSelector: hardware: rpi5 images: @@ -568,7 +783,7 @@ workloads: name: outline labels: app: outline - serviceAccountName: null + serviceAccountName: outline-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -588,7 +803,7 @@ workloads: name: planka labels: app: planka - serviceAccountName: null + serviceAccountName: planka-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -603,13 +818,16 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - postgres:15 + - quay.io/prometheuscommunity/postgres-exporter:v0.15.0 - kind: Deployment namespace: sso name: keycloak labels: app: keycloak - serviceAccountName: null - nodeSelector: {} + serviceAccountName: sso-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - quay.io/keycloak/keycloak:26.0.7 - kind: Deployment @@ -617,17 +835,26 @@ workloads: name: oauth2-proxy labels: app: oauth2-proxy - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: sso + name: sso-vault-sync + labels: + app: sso-vault-sync + serviceAccountName: sso-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: StatefulSet namespace: sso name: openldap labels: app: openldap - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -640,7 +867,7 @@ workloads: app: sui-metrics serviceAccountName: sui-metrics nodeSelector: - kubernetes.io/hostname: titan-24 + hardware: rpi5 images: - victoriametrics/vmagent:v1.103.0 - kind: Deployment @@ -648,6 +875,8 @@ workloads: name: traefik labels: app: traefik + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik serviceAccountName: traefik-ingress-controller nodeSelector: node-role.kubernetes.io/worker: 'true' @@ -669,8 +898,10 @@ workloads: name: vaultwarden labels: app: vaultwarden - serviceAccountName: null - nodeSelector: {} + serviceAccountName: vaultwarden-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - vaultwarden/server:1.35.2 services: @@ -1040,6 +1271,36 @@ services: port: 3333 targetPort: 3333 protocol: TCP +- namespace: crypto + name: wallet-monero-temp + type: ClusterIP + selector: + app: wallet-monero-temp + ports: + - name: rpc + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: finance + name: actual-budget + type: ClusterIP + selector: + app: actual-budget + ports: + - name: http + port: 80 + targetPort: 5006 + protocol: TCP +- namespace: finance + name: firefly + type: ClusterIP + selector: + app: firefly + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP - namespace: flux-system name: 
notification-controller type: ClusterIP @@ -1082,7 +1343,7 @@ services: protocol: TCP - namespace: gitea name: gitea-ssh - type: NodePort + type: LoadBalancer selector: app: gitea ports: @@ -1090,6 +1351,16 @@ services: port: 2242 targetPort: 2242 protocol: TCP +- namespace: health + name: wger + type: ClusterIP + selector: + app: wger + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP - namespace: jellyfin name: jellyfin type: ClusterIP @@ -1124,21 +1395,6 @@ services: port: 50000 targetPort: 50000 protocol: TCP -- namespace: kube-system - name: traefik - type: LoadBalancer - selector: - app.kubernetes.io/instance: traefik-kube-system - app.kubernetes.io/name: traefik - ports: - - name: web - port: 80 - targetPort: web - protocol: TCP - - name: websecure - port: 443 - targetPort: websecure - protocol: TCP - namespace: logging name: oauth2-proxy-logs type: ClusterIP @@ -1191,15 +1447,15 @@ services: port: 4190 targetPort: 4190 protocol: TCP -- namespace: mailu-mailserver - name: mailu-sync-listener +- namespace: maintenance + name: ariadne type: ClusterIP selector: - app: mailu-sync-listener + app: ariadne ports: - name: http - port: 8080 - targetPort: 8080 + port: 80 + targetPort: http protocol: TCP - namespace: monitoring name: dcgm-exporter @@ -1291,6 +1547,10 @@ services: port: 5432 targetPort: 5432 protocol: TCP + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP - namespace: sso name: keycloak type: ClusterIP @@ -1335,6 +1595,20 @@ services: port: 8429 targetPort: 8429 protocol: TCP +- namespace: traefik + name: traefik + type: LoadBalancer + selector: + app: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP - namespace: traefik name: traefik-metrics type: ClusterIP @@ -1447,6 +1721,19 @@ http_endpoints: kind: Ingress name: bstein-dev-home source: bstein-dev-home +- host: budget.bstein.dev + path: / + backend: + namespace: finance + service: actual-budget + port: 80 + workloads: + - kind: Deployment + name: actual-budget + via: + kind: Ingress + name: actual-budget + source: finance - host: call.live.bstein.dev path: / backend: @@ -1499,6 +1786,19 @@ http_endpoints: kind: Ingress name: nextcloud source: nextcloud +- host: health.bstein.dev + path: / + backend: + namespace: health + service: wger + port: 80 + workloads: + - kind: Deployment + name: wger + via: + kind: Ingress + name: wger + source: health - host: kit.live.bstein.dev path: /livekit/jwt backend: @@ -1558,6 +1858,65 @@ http_endpoints: kind: Ingress name: matrix-routing source: comms +- host: live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id002 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + 
workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: comms - host: logs.bstein.dev path: / backend: @@ -1601,9 +1960,7 @@ http_endpoints: namespace: comms service: matrix-authentication-service port: 8080 - workloads: &id002 - - kind: Deployment - name: matrix-authentication-service + workloads: *id002 via: kind: Ingress name: matrix-routing @@ -1647,9 +2004,7 @@ http_endpoints: namespace: comms service: matrix-guest-register port: 8080 - workloads: &id003 - - kind: Deployment - name: matrix-guest-register + workloads: *id003 via: kind: Ingress name: matrix-routing @@ -1722,6 +2077,19 @@ http_endpoints: kind: Ingress name: monerod source: monerod +- host: money.bstein.dev + path: / + backend: + namespace: finance + service: firefly + port: 80 + workloads: + - kind: Deployment + name: firefly + via: + kind: Ingress + name: firefly + source: finance - host: notes.bstein.dev path: / backend: @@ -1845,7 +2213,6 @@ helmrelease_host_hints: - live.bstein.dev - matrix.live.bstein.dev comms:comms/othrys-synapse: - - bstein.dev - kit.live.bstein.dev - live.bstein.dev - matrix.live.bstein.dev @@ -1856,6 +2223,8 @@ helmrelease_host_hints: - registry.bstein.dev logging:logging/data-prepper: - registry.bstein.dev + longhorn:longhorn-system/longhorn: + - registry.bstein.dev mailu:mailu-mailserver/mailu: - bstein.dev - mail.bstein.dev @@ -1863,5 +2232,8 @@ helmrelease_host_hints: - alerts.bstein.dev monitoring:monitoring/grafana: - bstein.dev + - mail.bstein.dev - metrics.bstein.dev - sso.bstein.dev + monitoring:monitoring/kube-state-metrics: + - atlas.bstein.dev diff --git a/services/comms/knowledge/catalog/runbooks.json b/services/comms/knowledge/catalog/runbooks.json index d7356ca..0718562 100644 --- a/services/comms/knowledge/catalog/runbooks.json +++ b/services/comms/knowledge/catalog/runbooks.json @@ -20,6 +20,22 @@ ], "body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured." 
}, + { + "path": "runbooks/comms-verify.md", + "title": "Othrys verification checklist", + "tags": [ + "comms", + "matrix", + "element", + "livekit" + ], + "entrypoints": [ + "https://live.bstein.dev", + "https://matrix.live.bstein.dev" + ], + "source_paths": [], + "body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `-`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN." + }, { "path": "runbooks/kb-authoring.md", "title": "KB authoring: what to write (and what not to)", diff --git a/services/comms/knowledge/diagrams/atlas-http.mmd b/services/comms/knowledge/diagrams/atlas-http.mmd index ab7c362..1aa7ac8 100644 --- a/services/comms/knowledge/diagrams/atlas-http.mmd +++ b/services/comms/knowledge/diagrams/atlas-http.mmd @@ -17,6 +17,11 @@ flowchart LR host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_budget_bstein_dev["budget.bstein.dev"] + svc_finance_actual_budget["finance/actual-budget (Service)"] + host_budget_bstein_dev --> svc_finance_actual_budget + wl_finance_actual_budget["finance/actual-budget (Deployment)"] + svc_finance_actual_budget --> wl_finance_actual_budget host_call_live_bstein_dev["call.live.bstein.dev"] svc_comms_element_call["comms/element-call (Service)"] host_call_live_bstein_dev --> svc_comms_element_call @@ -37,6 +42,11 @@ flowchart LR host_cloud_bstein_dev --> svc_nextcloud_nextcloud wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_health_bstein_dev["health.bstein.dev"] + svc_health_wger["health/wger (Service)"] + host_health_bstein_dev --> svc_health_wger + wl_health_wger["health/wger (Deployment)"] + svc_health_wger --> wl_health_wger host_kit_live_bstein_dev["kit.live.bstein.dev"] svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] host_kit_live_bstein_dev --> svc_comms_livekit_token_service @@ -50,6 +60,14 @@ flowchart LR host_live_bstein_dev --> svc_comms_matrix_wellknown svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service 
host_logs_bstein_dev["logs.bstein.dev"] svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"] host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs @@ -64,21 +82,20 @@ flowchart LR svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front host_matrix_live_bstein_dev["matrix.live.bstein.dev"] - svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] - svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register - wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] - svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register host_monero_bstein_dev["monero.bstein.dev"] svc_crypto_monerod["crypto/monerod (Service)"] host_monero_bstein_dev --> svc_crypto_monerod wl_crypto_monerod["crypto/monerod (Deployment)"] svc_crypto_monerod --> wl_crypto_monerod + host_money_bstein_dev["money.bstein.dev"] + svc_finance_firefly["finance/firefly (Service)"] + host_money_bstein_dev --> svc_finance_firefly + wl_finance_firefly["finance/firefly (Deployment)"] + svc_finance_firefly --> wl_finance_firefly host_notes_bstein_dev["notes.bstein.dev"] svc_outline_outline["outline/outline (Service)"] host_notes_bstein_dev --> svc_outline_outline @@ -143,19 +160,29 @@ flowchart LR svc_comms_livekit wl_comms_livekit svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service svc_comms_matrix_guest_register wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service end subgraph crypto[crypto] svc_crypto_monerod wl_crypto_monerod end + subgraph finance[finance] + svc_finance_actual_budget + wl_finance_actual_budget + svc_finance_firefly + wl_finance_firefly + end subgraph gitea[gitea] svc_gitea_gitea wl_gitea_gitea end + subgraph health[health] + svc_health_wger + wl_health_wger + end subgraph jellyfin[jellyfin] svc_jellyfin_pegasus wl_jellyfin_pegasus diff --git a/services/comms/knowledge/metis.md b/services/comms/knowledge/metis.md new file mode 100644 index 0000000..5b0d06b --- /dev/null +++ b/services/comms/knowledge/metis.md @@ -0,0 +1,26 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers. 
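+
+A minimal inventory sketch matching these classes (field names here are assumptions modeled on the `inventory.example.yaml` schema described below, not the actual file):
+
+```yaml
+classes:
+  rpi5-worker:
+    os: ubuntu-24.04
+    k3s_role: agent
+    labels:
+      node-role.kubernetes.io/worker: "true"
+nodes:
+  titan-04:
+    class: rpi5-worker
+  titan-13:
+    class: rpi4-longhorn          # class name is illustrative
+    longhorn_disks:
+      - mount: /mnt/astreae
+        uuid: 6031fa8b-f28c-45c3-b7bc-6133300e07c6  # from the UUID table below
+```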
+ +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). +- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. +- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. diff --git a/services/comms/knowledge/runbooks/comms-verify.md b/services/comms/knowledge/runbooks/comms-verify.md new file mode 100644 index 0000000..8c09d0a --- /dev/null +++ b/services/comms/knowledge/runbooks/comms-verify.md @@ -0,0 +1,30 @@ +--- +title: Othrys verification checklist +tags: + - comms + - matrix + - element + - livekit +entrypoints: + - https://live.bstein.dev + - https://matrix.live.bstein.dev +--- + +1) Guest join: +- Open a private window and visit: + `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join` +- Confirm the guest join flow works and the displayname becomes `-`. + +2) Keycloak login: +- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect. + +3) Video rooms: +- Start an Element Call room and confirm audio/video with a second account. +- Check that guests can read public rooms but cannot start calls. + +4) Well-known: +- `https://live.bstein.dev/.well-known/matrix/client` returns JSON. +- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON. + +5) TURN reachability: +- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN. diff --git a/services/comms/knowledge/software/metis.md b/services/comms/knowledge/software/metis.md new file mode 100644 index 0000000..7ca3b39 --- /dev/null +++ b/services/comms/knowledge/software/metis.md @@ -0,0 +1,73 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers. 
+ +### Jetson nodes (titan-20/21) +- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64. +- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused). +- k3s agent with drop-in 99-nofile.conf. + +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). +- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. +- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. 
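+
+The fstab injection mentioned in the burn step above would reproduce entries like these on titan-13 (a sketch built from the UUID table; the `defaults,nofail` options are assumptions so a missing disk does not block boot):
+
+```
+# /etc/fstab additions for Longhorn disks on titan-13 (illustrative)
+UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6  /mnt/astreae  ext4  defaults,nofail  0  2
+UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae  /mnt/asteria  ext4  defaults,nofail  0  2
+```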
+ +## Node OS/Kernel/CRI snapshot (Jan 2026) +- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 +- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 + + +### External hosts +- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled. +- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q). +- titan-23/oceanus: TODO audit (future). + + +### Control plane Pis (titan-0a/0b/0c) +- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2. +- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot. +- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO). 
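+
+One way to start cataloging the etcd snapshots flagged as TODO above (run on any control-plane Pi; assumes the stock k3s snapshot location — adjust if `--etcd-snapshot-dir` is set):
+
+```
+# List etcd snapshots known to k3s
+sudo k3s etcd-snapshot ls
+# Or inspect the default on-disk location directly
+ls -lh /var/lib/rancher/k3s/server/db/snapshots/
+```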
+ + +## k3s versions +- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2) +- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2) +- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2 diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6fb6bff..e077620 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -19,6 +19,8 @@ API_KEY = os.environ.get("CHAT_API_KEY", "") KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") +ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "") +ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "") BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas") SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") @@ -297,6 +299,21 @@ def k8s_get(path: str, timeout: int = 8) -> dict: raw = resp.read() return json.loads(raw.decode()) if raw else {} +def _ariadne_state(timeout: int = 5) -> dict | None: + if not ARIADNE_STATE_URL: + return None + headers = {} + if ARIADNE_STATE_TOKEN: + headers["X-Internal-Token"] = ARIADNE_STATE_TOKEN + r = request.Request(ARIADNE_STATE_URL, headers=headers, method="GET") + try: + with request.urlopen(r, timeout=timeout) as resp: + raw = resp.read() + payload = json.loads(raw.decode()) if raw else {} + return payload if isinstance(payload, dict) else None + except Exception: + return None + def k8s_pods(namespace: str) -> list[dict]: data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500") items = data.get("items") or [] @@ -445,6 +462,17 @@ def vm_cluster_snapshot() -> str: return "\n".join(parts).strip() def nodes_summary(cluster_name: str) -> str: + state = _ariadne_state() + if state: + nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {} + total = nodes.get("total") + ready = nodes.get("ready") + not_ready = nodes.get("not_ready") + if isinstance(total, int) and isinstance(ready, int): + not_ready = not_ready if isinstance(not_ready, int) else max(total - ready, 0) + if not_ready: + return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." + return f"{cluster_name} cluster has {total} nodes, all Ready." try: data = k8s_get("/api/v1/nodes?limit=500") except Exception: @@ -467,6 +495,16 @@ def nodes_summary(cluster_name: str) -> str: return f"{cluster_name} cluster has {total} nodes, all Ready." def nodes_names_summary(cluster_name: str) -> str: + state = _ariadne_state() + if state: + nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {} + names = nodes.get("names") + if isinstance(names, list) and names: + cleaned = sorted({str(n) for n in names if n}) + if len(cleaned) <= 30: + return f"{cluster_name} node names: {', '.join(cleaned)}." + shown = ", ".join(cleaned[:30]) + return f"{cluster_name} node names: {shown}, … (+{len(cleaned) - 30} more)." 
try: data = k8s_get("/api/v1/nodes?limit=500") except Exception: diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 52d10f9..0356e06 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -311,10 +311,18 @@ spec: value: "0 0 1 1 *" - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM value: "*/10 * * * *" + - name: ARIADNE_SCHEDULE_CLUSTER_STATE + value: "*/15 * * * *" + - name: ARIADNE_CLUSTER_STATE_KEEP + value: "168" - name: WELCOME_EMAIL_ENABLED value: "true" - name: K8S_API_TIMEOUT_SEC value: "5" + - name: ARIADNE_VM_URL + value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 + - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC + value: "5" - name: OPENSEARCH_URL value: http://opensearch-master.logging.svc.cluster.local:9200 - name: OPENSEARCH_LIMIT_BYTES diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml index 88689cb..33620d0 100644 --- a/services/maintenance/ariadne-rbac.yaml +++ b/services/maintenance/ariadne-rbac.yaml @@ -21,12 +21,27 @@ rules: - list - watch - delete + - apiGroups: [""] + resources: + - nodes + - namespaces + verbs: + - get + - list + - watch - apiGroups: [""] resources: - pods/exec verbs: - get - create + - apiGroups: ["kustomize.toolkit.fluxcd.io"] + resources: + - kustomizations + verbs: + - get + - list + - watch --- apiVersion: rbac.authorization.k8s.io/v1 From f14be5d7ef210ac288718f970c82a7f3cb4698b2 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Mon, 26 Jan 2026 06:33:26 +0000 Subject: [PATCH 230/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 4e261cb..3933caf 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-48 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-49 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 7fd71f4bab4a056dbd68b347d2758e6ed8300c28 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 09:23:21 -0300 Subject: [PATCH 231/416] comms: inject chat ai keys for atlasbot --- services/comms/atlasbot-deployment.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 70844eb..aec7b79 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -25,6 +25,12 @@ spec: vault.hashicorp.com/agent-inject-secret-livekit-primary: "kv/data/atlas/comms/livekit-api" vault.hashicorp.com/agent-inject-template-livekit-primary: | {{- with secret "kv/data/atlas/comms/livekit-api" -}}{{ .Data.data.primary }}{{- end -}} + vault.hashicorp.com/agent-inject-secret-chat-matrix: "kv/data/atlas/shared/chat-ai-keys-runtime" + vault.hashicorp.com/agent-inject-template-chat-matrix: | + {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.matrix }}{{- end -}} + vault.hashicorp.com/agent-inject-secret-chat-homepage: "kv/data/atlas/shared/chat-ai-keys-runtime" + vault.hashicorp.com/agent-inject-template-chat-homepage: | + {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ 
.Data.data.homepage }}{{- end -}} vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-template-bot-pass: | {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}} From 14d18048d5390b9e468e462dc8ed73ae8aed9b0a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 09:29:28 -0300 Subject: [PATCH 232/416] comms: fix duplicate chat key annotations --- services/comms/atlasbot-deployment.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index aec7b79..70844eb 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -25,12 +25,6 @@ spec: vault.hashicorp.com/agent-inject-secret-livekit-primary: "kv/data/atlas/comms/livekit-api" vault.hashicorp.com/agent-inject-template-livekit-primary: | {{- with secret "kv/data/atlas/comms/livekit-api" -}}{{ .Data.data.primary }}{{- end -}} - vault.hashicorp.com/agent-inject-secret-chat-matrix: "kv/data/atlas/shared/chat-ai-keys-runtime" - vault.hashicorp.com/agent-inject-template-chat-matrix: | - {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.matrix }}{{- end -}} - vault.hashicorp.com/agent-inject-secret-chat-homepage: "kv/data/atlas/shared/chat-ai-keys-runtime" - vault.hashicorp.com/agent-inject-template-chat-homepage: | - {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.homepage }}{{- end -}} vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-template-bot-pass: | {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}} From 352d4991f42a9ffe89fb585f0d0e17642e853828 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 09:36:08 -0300 Subject: [PATCH 233/416] comms: handle arch node counts and extend LLM timeout --- services/comms/scripts/atlasbot/bot.py | 39 +++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e077620..797b601 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -16,6 +16,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") API_KEY = os.environ.get("CHAT_API_KEY", "") +OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "90")) KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") @@ -525,6 +526,29 @@ def nodes_names_summary(cluster_name: str) -> str: shown = ", ".join(names[:30]) return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)." 
+ +def nodes_arch_summary(cluster_name: str, arch: str) -> str: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return "" + items = data.get("items") or [] + if not isinstance(items, list) or not items: + return "" + normalized = (arch or "").strip().lower() + if normalized in ("aarch64", "arm64"): + arch_label = "arm64" + elif normalized in ("x86_64", "x86-64", "amd64"): + arch_label = "amd64" + else: + arch_label = normalized + total = 0 + for node in items: + labels = (node.get("metadata") or {}).get("labels") or {} + if labels.get("kubernetes.io/arch") == arch_label: + total += 1 + return f"{cluster_name} cluster has {total} {arch_label} nodes." + def _strip_code_fence(text: str) -> str: cleaned = (text or "").strip() match = CODE_FENCE_RE.match(cleaned) @@ -622,7 +646,7 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str: if API_KEY: headers["x-api-key"] = API_KEY r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) - with request.urlopen(r, timeout=20) as resp: + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: data = json.loads(resp.read().decode()) raw_reply = data.get("message") or data.get("response") or data.get("reply") or data reply = _normalize_reply(raw_reply) or "I'm here to help." @@ -692,6 +716,19 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")): + if any(word in lower_body for word in ("cluster", "atlas", "titan")): + arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64" + summary = nodes_arch_summary("Atlas", arch) + if not summary: + send_msg( + token, + rid, + "I couldn’t reach the cluster API to count nodes by architecture. Try again in a moment.", + ) + continue + send_msg(token, rid, summary) + continue if re.search(r"\bnode names?\b|\bnodes? 
named\b|\bnaming\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): names_summary = nodes_names_summary("Atlas") From fec7713049da226e3df814d213ed2b6e4ca5740c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 09:38:38 -0300 Subject: [PATCH 234/416] comms: bump atlasbot configmap checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 70844eb..a8a3009 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-9 + checksum/atlasbot-configmap: manual-atlasbot-10 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 2c3ffdbf955fbdc5ef8da59cd28efab2324c5857 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 11:44:28 -0300 Subject: [PATCH 235/416] ai-llm: tighten gpu placement and resources --- services/ai-llm/deployment.yaml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index dfa1bdd..4f34d86 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -21,8 +21,8 @@ spec: app: ollama annotations: ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0 - ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24) - ai.bstein.dev/restartedAt: "2026-01-25T19:10:00Z" + ai.bstein.dev/gpu: GPU pool (titan-22/24) + ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z" spec: affinity: nodeAffinity: @@ -32,8 +32,6 @@ spec: - key: kubernetes.io/hostname operator: In values: - - titan-20 - - titan-21 - titan-22 - titan-24 runtimeClassName: nvidia @@ -69,8 +67,8 @@ spec: mountPath: /root/.ollama resources: requests: - cpu: 250m - memory: 1Gi + cpu: 500m + memory: 2Gi nvidia.com/gpu.shared: 1 limits: nvidia.com/gpu.shared: 1 @@ -97,10 +95,10 @@ spec: mountPath: /root/.ollama resources: requests: - cpu: "2" - memory: 8Gi + cpu: "4" + memory: 16Gi nvidia.com/gpu.shared: 1 limits: - cpu: "4" - memory: 12Gi + cpu: "8" + memory: 24Gi nvidia.com/gpu.shared: 1 From 28570a1f5cf1210f72820257590f6e89f4d75da3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 12:06:48 -0300 Subject: [PATCH 236/416] atlasbot: answer jetson nodes from knowledge --- knowledge/catalog/atlas.json | 2 +- knowledge/catalog/atlas.yaml | 2 +- knowledge/catalog/runbooks.json | 8 ++++ scripts/knowledge_render_atlas.py | 9 ++++- services/comms/knowledge/catalog/atlas.json | 2 +- services/comms/knowledge/catalog/atlas.yaml | 2 +- .../comms/knowledge/catalog/runbooks.json | 8 ++++ services/comms/scripts/atlasbot/bot.py | 39 +++++++++++++++++++ 8 files changed, 66 insertions(+), 6 deletions(-) diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json index 21ac407..951c807 100644 --- a/knowledge/catalog/atlas.json +++ b/knowledge/catalog/atlas.json @@ -1057,7 +1057,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/ariadne:0.1.0-48" + "registry.bstein.dev/bstein/ariadne:0.1.0-49" ] }, { diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml index b3b0119..637b5f9 100644 --- a/knowledge/catalog/atlas.yaml +++ b/knowledge/catalog/atlas.yaml @@ -711,7 +711,7 @@ workloads: 
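Both knowledge catalogs (atlas.json above, atlas.yaml below) track the same image tag that flux-bot bumps in services/maintenance/kustomization.yaml via the Flux image-automation marker. A hypothetical sketch of reading that tag back out of the marker line (`MARKER_RE` and `current_ariadne_tag` are illustrative names, not from scripts/knowledge_render_atlas.py):

```python
import re

# Illustrative only; the real render logic lives in
# scripts/knowledge_render_atlas.py.
MARKER_RE = re.compile(
    r'newTag:\s*(\S+)\s*#\s*\{"\$imagepolicy":\s*"maintenance:ariadne:tag"\}'
)

def current_ariadne_tag(kustomization_text: str) -> str | None:
    """Return the tag carried on the Flux image-automation marker line."""
    match = MARKER_RE.search(kustomization_text)
    return match.group(1) if match else None

sample = 'newTag: 0.1.0-49 # {"$imagepolicy": "maintenance:ariadne:tag"}'
assert current_ariadne_tag(sample) == "0.1.0-49"
```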
kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/ariadne:0.1.0-48 + - registry.bstein.dev/bstein/ariadne:0.1.0-49 - kind: Deployment namespace: maintenance name: maintenance-vault-sync diff --git a/knowledge/catalog/runbooks.json b/knowledge/catalog/runbooks.json index 0718562..960510d 100644 --- a/knowledge/catalog/runbooks.json +++ b/knowledge/catalog/runbooks.json @@ -85,5 +85,13 @@ "clusters/atlas/<...>" ], "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + }, + { + "path": "software/metis.md", + "title": "metis", + "tags": [], + "entrypoints": [], + "source_paths": [], + "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. 
Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). 
fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2" } ] diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index 34938e7..206dcd9 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -529,9 +529,14 @@ def main() -> int: diagram_path.write_text(diagram, encoding="utf-8") # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster. - runbooks_dir = out_dir / "runbooks" + runbook_dirs = [ + out_dir / "runbooks", + out_dir / "software", + ] runbooks: list[dict[str, Any]] = [] - if runbooks_dir.exists(): + for runbooks_dir in runbook_dirs: + if not runbooks_dir.exists(): + continue for md_file in sorted(runbooks_dir.glob("*.md")): raw = md_file.read_text(encoding="utf-8") fm: dict[str, Any] = {} diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json index 21ac407..951c807 100644 --- a/services/comms/knowledge/catalog/atlas.json +++ b/services/comms/knowledge/catalog/atlas.json @@ -1057,7 +1057,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/ariadne:0.1.0-48" + "registry.bstein.dev/bstein/ariadne:0.1.0-49" ] }, { diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml index b3b0119..637b5f9 100644 --- a/services/comms/knowledge/catalog/atlas.yaml +++ b/services/comms/knowledge/catalog/atlas.yaml @@ -711,7 +711,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/ariadne:0.1.0-48 + - registry.bstein.dev/bstein/ariadne:0.1.0-49 - kind: Deployment namespace: maintenance name: maintenance-vault-sync diff --git a/services/comms/knowledge/catalog/runbooks.json b/services/comms/knowledge/catalog/runbooks.json index 0718562..960510d 100644 --- a/services/comms/knowledge/catalog/runbooks.json +++ b/services/comms/knowledge/catalog/runbooks.json @@ -85,5 +85,13 @@ "clusters/atlas/<...>" ], "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + }, + { + "path": "software/metis.md", + "title": "metis", + "tags": [], + "entrypoints": [], + "source_paths": [], + "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria 
UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 
6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2" } ] diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 797b601..18ec611 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -75,6 +75,8 @@ METRIC_HINT_WORDS = { } CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL) +TITAN_NODE_RE = re.compile(r"\\btitan-[0-9a-z]{2}\\b", re.IGNORECASE) +TITAN_RANGE_RE = re.compile(r"\\btitan-([0-9a-z]{2})/([0-9a-z]{2})\\b", re.IGNORECASE) def _tokens(text: str) -> list[str]: toks = [t.lower() for t in TOKEN_RE.findall(text or "")] @@ -233,6 +235,35 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: used += len(chunk) return "\n".join(parts).strip() +def _extract_titan_nodes(text: str) -> list[str]: + names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n} + for match in TITAN_RANGE_RE.finditer(text or ""): + left, right = match.groups() + if left: + names.add(f"titan-{left.lower()}") + if right: + names.add(f"titan-{right.lower()}") + return sorted(names) + +def jetson_nodes_from_kb() -> list[str]: + for doc in KB.get("runbooks", []): + if not isinstance(doc, dict): + continue + body = str(doc.get("body") or "") + for line in body.splitlines(): + if "jetson" not in line.lower(): + continue + names = _extract_titan_nodes(line) + if names: + return names + return [] + +def jetson_nodes_summary(cluster_name: str) -> str: + names = jetson_nodes_from_kb() + if names: + return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}." + return "" + def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]: q = (query or "").strip() if not q or not KB.get("catalog"): @@ -729,6 +760,14 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + if "jetson" in lower_body: + if any(word in lower_body for word in ("cluster", "atlas", "titan", "node", "nodes")): + summary = jetson_nodes_summary("Atlas") + if summary: + send_msg(token, rid, summary) + else: + send_msg(token, rid, "Jetson inventory is not available in the knowledge base yet.") + continue if re.search(r"\bnode names?\b|\bnodes? 
named\b|\bnaming\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): names_summary = nodes_names_summary("Atlas") From 53e4b4036b73324f8632d595da8ea368d2c1749c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 12:08:33 -0300 Subject: [PATCH 237/416] comms: bump atlasbot config checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index a8a3009..c96c79c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-10 + checksum/atlasbot-configmap: manual-atlasbot-11 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From fff00dbe95d1bc2147e9f0352d6a7c98e27e33b7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 12:36:51 -0300 Subject: [PATCH 238/416] atlasbot: ground node inventory and soften llm failures --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 216 ++++++++++++++++++++---- 2 files changed, 181 insertions(+), 37 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c96c79c..2c08853 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-11 + checksum/atlasbot-configmap: manual-atlasbot-12 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 18ec611..8edc28d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3,6 +3,7 @@ import json import os import re import ssl +import threading import time from typing import Any from urllib import error, parse, request @@ -156,6 +157,13 @@ def send_msg(token: str, room: str, text: str): KB = {"catalog": {}, "runbooks": []} _HOST_INDEX: dict[str, list[dict]] = {} _NAME_INDEX: set[str] = set() +_NODE_CLASS_INDEX: dict[str, list[str]] = {} +_NODE_CLASS_RPI4: set[str] = set() +_NODE_CLASS_RPI5: set[str] = set() +_NODE_CLASS_AMD64: set[str] = set() +_NODE_CLASS_JETSON: set[str] = set() +_NODE_CLASS_EXTERNAL: set[str] = set() +_NODE_CLASS_NON_RPI: set[str] = set() def _load_json_file(path: str) -> Any | None: try: @@ -166,6 +174,8 @@ def _load_json_file(path: str) -> Any | None: def load_kb(): global KB, _HOST_INDEX, _NAME_INDEX + global _NODE_CLASS_INDEX, _NODE_CLASS_RPI4, _NODE_CLASS_RPI5, _NODE_CLASS_AMD64, _NODE_CLASS_JETSON + global _NODE_CLASS_EXTERNAL, _NODE_CLASS_NON_RPI if not KB_DIR: return catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} @@ -188,6 +198,24 @@ def load_kb(): names.add(str(w["name"]).lower()) _NAME_INDEX = names + node_classes = _parse_node_classes(runbooks) + _NODE_CLASS_INDEX = node_classes + _NODE_CLASS_RPI4 = set(node_classes.get("rpi4", [])) + _NODE_CLASS_RPI5 = set(node_classes.get("rpi5", [])) + _NODE_CLASS_AMD64 = set(node_classes.get("amd64", [])) + _NODE_CLASS_JETSON = set(node_classes.get("jetson", [])) + 
_NODE_CLASS_EXTERNAL = set(node_classes.get("external", [])) + _NODE_CLASS_NON_RPI = set( + sorted( + ( + set().union(*node_classes.values()) + - _NODE_CLASS_RPI4 + - _NODE_CLASS_RPI5 + - _NODE_CLASS_EXTERNAL + ) + ) + ) + def kb_retrieve(query: str, *, limit: int = 3) -> str: q = (query or "").strip() if not q or not KB.get("runbooks"): @@ -237,6 +265,12 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: def _extract_titan_nodes(text: str) -> list[str]: names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n} + for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE): + tail = match.group(1) + for part in re.split(r"[/,]", tail): + part = part.strip() + if part: + names.add(f"titan-{part.lower()}") for match in TITAN_RANGE_RE.finditer(text or ""): left, right = match.groups() if left: @@ -245,6 +279,83 @@ def _extract_titan_nodes(text: str) -> list[str]: names.add(f"titan-{right.lower()}") return sorted(names) +def _parse_node_classes(runbooks: list[dict[str, Any]]) -> dict[str, list[str]]: + classes: dict[str, list[str]] = {} + for doc in runbooks: + if not isinstance(doc, dict): + continue + body = str(doc.get("body") or "") + for line in body.splitlines(): + stripped = line.strip() + if "titan-" not in stripped.lower(): + continue + label = "" + nodes: list[str] = [] + if stripped.startswith("-") and ":" in stripped: + label, rest = stripped.lstrip("-").split(":", 1) + nodes = _extract_titan_nodes(rest) + label = label.strip().lower() + else: + nodes = _extract_titan_nodes(stripped) + if not nodes: + continue + if "jetson" in stripped.lower(): + classes.setdefault("jetson", nodes) + if "amd64" in stripped.lower() or "x86" in stripped.lower(): + classes.setdefault("amd64", nodes) + if "rpi4" in stripped.lower(): + classes.setdefault("rpi4", nodes) + if "rpi5" in stripped.lower(): + classes.setdefault("rpi5", nodes) + if "external" in stripped.lower() or "non-cluster" in stripped.lower(): + classes.setdefault("external", nodes) + if label: + classes.setdefault(label, nodes) + return {k: sorted(set(v)) for k, v in classes.items()} + +def node_inventory_answer(cluster_name: str, query: str) -> str: + q = (query or "").lower() + if "jetson" in q and _NODE_CLASS_JETSON: + names = sorted(_NODE_CLASS_JETSON) + return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}." + if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: + names = sorted(_NODE_CLASS_NON_RPI) + if names: + return f"{cluster_name} non‑Raspberry Pi nodes: {', '.join(names)}." + if "raspberry" in q or "rpi" in q: + if "rpi4" in q and _NODE_CLASS_RPI4: + names = sorted(_NODE_CLASS_RPI4) + return f"{cluster_name} rpi4 nodes: {', '.join(names)}." + if "rpi5" in q and _NODE_CLASS_RPI5: + names = sorted(_NODE_CLASS_RPI5) + return f"{cluster_name} rpi5 nodes: {', '.join(names)}." + names = sorted(_NODE_CLASS_RPI4 | _NODE_CLASS_RPI5) + if names: + return f"{cluster_name} Raspberry Pi nodes: {', '.join(names)}." + if ("amd64" in q or "x86" in q) and _NODE_CLASS_AMD64: + names = sorted(_NODE_CLASS_AMD64) + return f"{cluster_name} amd64 nodes: {', '.join(names)}." 
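+    # Editorial sketch, not part of the original hunk: queries that match no
+    # known node class fall through to "" below, so sync_loop can defer to the
+    # grounded-LLM path instead of guessing a class.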
+ return "" + +def node_inventory_context(query: str) -> str: + q = (query or "").lower() + if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "x86", "cluster")): + return "" + lines: list[str] = ["Node inventory (KB):"] + if _NODE_CLASS_RPI5: + lines.append(f"- rpi5: {', '.join(sorted(_NODE_CLASS_RPI5))}") + if _NODE_CLASS_RPI4: + lines.append(f"- rpi4: {', '.join(sorted(_NODE_CLASS_RPI4))}") + if _NODE_CLASS_JETSON: + lines.append(f"- jetson: {', '.join(sorted(_NODE_CLASS_JETSON))}") + if _NODE_CLASS_AMD64: + lines.append(f"- amd64: {', '.join(sorted(_NODE_CLASS_AMD64))}") + if _NODE_CLASS_EXTERNAL: + lines.append(f"- external: {', '.join(sorted(_NODE_CLASS_EXTERNAL))}") + if len(lines) == 1: + return "" + return "\n".join(lines) + def jetson_nodes_from_kb() -> list[str]: for doc in KB.get("runbooks", []): if not isinstance(doc, dict): @@ -627,6 +738,10 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st if endpoints: parts.append(endpoints) + inventory = node_inventory_context(prompt) + if inventory: + parts.append(inventory) + if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set) @@ -656,35 +771,58 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st return "\n\n".join([p for p in parts if p]).strip() -def ollama_reply(hist_key, prompt: str, *, context: str) -> str: - try: - system = ( - "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " - "Be helpful, direct, and concise. " - "Prefer answering with exact repo paths and Kubernetes resource names. " - "Never include or request secret values. " - "Respond in plain sentences; do not return JSON or code fences unless explicitly asked." - ) - transcript_parts = [system] - if context: - transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS]) - transcript_parts.extend(history[hist_key][-24:]) - transcript_parts.append(f"User: {prompt}") - transcript = "\n".join(transcript_parts) +def _ollama_call(hist_key, prompt: str, *, context: str) -> str: + system = ( + "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " + "Be helpful, direct, and concise. " + "Prefer answering with exact repo paths and Kubernetes resource names. " + "Never include or request secret values. " + "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " + "If the answer is not grounded in the provided context or tool data, say you do not know." + ) + transcript_parts = [system] + if context: + transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS]) + transcript_parts.extend(history[hist_key][-24:]) + transcript_parts.append(f"User: {prompt}") + transcript = "\n".join(transcript_parts) - payload = {"model": MODEL, "message": transcript} - headers = {"Content-Type": "application/json"} - if API_KEY: - headers["x-api-key"] = API_KEY - r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) - with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: - data = json.loads(resp.read().decode()) - raw_reply = data.get("message") or data.get("response") or data.get("reply") or data - reply = _normalize_reply(raw_reply) or "I'm here to help." 
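The refactor continuing below threads a deterministic fallback through `ollama_reply`, so a backend failure degrades to a KB-grounded answer rather than an error string. A minimal, self-contained sketch of that pattern (`answer` and `flaky_model` are hypothetical names; only the busy-message text comes from the patch):

```python
# Sketch of the "soften llm failures" pattern: try the model backend, and
# on any failure return a deterministic, grounded answer instead of an apology.
def answer(prompt: str, *, call_model, grounded_fallback: str = "") -> str:
    try:
        return call_model(prompt)
    except Exception:
        # Same degradation order as ollama_reply: grounded fallback first,
        # generic busy message last.
        return grounded_fallback or "Model backend is busy. Try again in a moment."

def flaky_model(prompt: str) -> str:
    raise TimeoutError("backend busy")

print(answer(
    "how many amd64 nodes?",
    call_model=flaky_model,
    grounded_fallback="Atlas amd64 nodes: titan-22, titan-24.",
))
```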
- history[hist_key].append(f"Atlas: {reply}") - return reply + payload = {"model": MODEL, "message": transcript} + headers = {"Content-Type": "application/json"} + if API_KEY: + headers["x-api-key"] = API_KEY + r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + raw_reply = data.get("message") or data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." + history[hist_key].append(f"Atlas: {reply}") + return reply + +def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str: + try: + return _ollama_call(hist_key, prompt, context=context) except Exception: - return "I’m here — but I couldn’t reach the model backend." + if fallback: + history[hist_key].append(f"Atlas: {fallback}") + return fallback + return "Model backend is busy. Try again in a moment." + +def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str: + result: dict[str, str] = {"reply": ""} + done = threading.Event() + + def worker(): + result["reply"] = ollama_reply(hist_key, prompt, context=context, fallback=fallback) + done.set() + + thread = threading.Thread(target=worker, daemon=True) + thread.start() + if not done.wait(2.0): + send_msg(token, room, "Thinking…") + done.wait() + thread.join(timeout=1) + return result["reply"] or fallback or "Model backend is busy. Try again in a moment." def sync_loop(token: str, room_id: str): since = None @@ -747,6 +885,10 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + inventory_answer = node_inventory_answer("Atlas", lower_body) + if inventory_answer: + send_msg(token, rid, inventory_answer) + continue if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")): if any(word in lower_body for word in ("cluster", "atlas", "titan")): arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64" @@ -760,14 +902,6 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue - if "jetson" in lower_body: - if any(word in lower_body for word in ("cluster", "atlas", "titan", "node", "nodes")): - summary = jetson_nodes_summary("Atlas") - if summary: - send_msg(token, rid, summary) - else: - send_msg(token, rid, "Jetson inventory is not available in the knowledge base yet.") - continue if re.search(r"\bnode names?\b|\bnodes? 
named\b|\bnaming\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): names_summary = nodes_names_summary("Atlas") @@ -803,7 +937,17 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - reply = ollama_reply(hist_key, body, context=context) + fallback = "" + if "node" in lower_body or "cluster" in lower_body: + fallback = node_inventory_answer("Atlas", lower_body) + reply = ollama_reply_with_thinking( + token, + rid, + hist_key, + body, + context=context, + fallback=fallback, + ) send_msg(token, rid, reply) def login_with_retry(): From 33b5e2b678191b9c357f9ea64c8a03c02f9eae23 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 14:08:11 -0300 Subject: [PATCH 239/416] atlasbot: add metrics kb and long timeout --- knowledge/catalog/metrics.json | 1880 +++++++++++++++++ scripts/knowledge_render_atlas.py | 65 + .../bstein-dev-home/backend-deployment.yaml | 4 +- .../chat-ai-gateway-deployment.yaml | 2 + services/bstein-dev-home/scripts/gateway.py | 3 +- services/comms/atlasbot-deployment.yaml | 8 +- services/comms/knowledge/catalog/metrics.json | 1880 +++++++++++++++++ services/comms/kustomization.yaml | 1 + services/comms/scripts/atlasbot/bot.py | 97 +- 9 files changed, 3934 insertions(+), 6 deletions(-) create mode 100644 knowledge/catalog/metrics.json create mode 100644 services/comms/knowledge/catalog/metrics.json diff --git a/knowledge/catalog/metrics.json b/knowledge/catalog/metrics.json new file mode 100644 index 0000000..e929db5 --- /dev/null +++ b/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) 
(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - 
max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": 
"prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + 
"atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + 
"panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - 
kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) 
((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": 
"Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", 
\"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * 
on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) 
(kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", 
+ "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) 
(kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index 206dcd9..1e305cb 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -26,6 +26,7 @@ from typing import Any, Iterable import yaml REPO_ROOT = Path(__file__).resolve().parents[1] +DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards" CLUSTER_SCOPED_KINDS = { "Namespace", @@ -67,6 +68,64 @@ def _sync_tree(source: Path, dest: Path) -> None: shutil.copytree(source, dest) +def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]: + panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else [] + for panel in panels: + if not isinstance(panel, dict): + continue + if panel.get("type") == "row" and isinstance(panel.get("panels"), list): + yield from _iter_dashboard_panels({"panels": panel.get("panels")}) + continue + yield panel + + +def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]: + index: list[dict[str, Any]] = [] + for path in sorted(dashboard_dir.glob("*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + continue + if not isinstance(data, dict): + continue + dash_title = data.get("title") or path.stem + dash_tags = data.get("tags") or [] + for panel in _iter_dashboard_panels(data): + targets = panel.get("targets") + if not isinstance(targets, list): + continue + exprs: list[str] = [] + for target in targets: + if not isinstance(target, dict): + continue + expr = target.get("expr") + if isinstance(expr, str) and expr.strip(): + exprs.append(expr.strip()) + if not exprs: + continue + datasource = panel.get("datasource") or {} + if isinstance(datasource, dict): + ds_uid = datasource.get("uid") + ds_type = datasource.get("type") + else: + ds_uid = None + ds_type = None + index.append( + { + "dashboard": dash_title, + "panel_title": panel.get("title") or "", + "panel_id": panel.get("id"), + "panel_type": panel.get("type"), + "description": panel.get("description") or "", + "tags": dash_tags, + "datasource_uid": ds_uid, + "datasource_type": ds_type, + "exprs": exprs, + } + ) + return index + + def kustomize_build(path: Path) -> str: rel = path.relative_to(REPO_ROOT) try: @@ -516,6 +575,7 @@ def main() -> int: summary_path = out_dir / "catalog" / "atlas-summary.json" diagram_path = out_dir / "diagrams" / "atlas-http.mmd" runbooks_json_path = out_dir / "catalog" / "runbooks.json" + metrics_json_path = out_dir / "catalog" / "metrics.json" catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix() catalog_path.write_text( @@ -560,12 +620,17 @@ def main() -> int: } ) runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8") + metrics_index = _extract_metrics_index(DASHBOARD_DIR) + metrics_json_path.write_text( + json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8" + ) print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}") print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {summary_path.relative_to(REPO_ROOT)}") print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}") print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") + print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}") if args.sync_comms: comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" diff --git a/services/bstein-dev-home/backend-deployment.yaml 
b/services/bstein-dev-home/backend-deployment.yaml index 2170396..ecf478c 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -58,14 +58,14 @@ spec: args: - >- . /vault/secrets/portal-env.sh - && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 180 app:app + && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 600 app:app env: - name: AI_CHAT_API value: http://ollama.ai.svc.cluster.local:11434 - name: AI_CHAT_MODEL value: qwen2.5-coder:7b-instruct-q4_0 - name: AI_CHAT_TIMEOUT_SEC - value: "60" + value: "480" - name: AI_NODE_NAME valueFrom: fieldRef: diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 40d74fe..7209da6 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -47,6 +47,8 @@ spec: env: - name: UPSTREAM_URL value: http://bstein-dev-home-backend/api/chat + - name: UPSTREAM_TIMEOUT_SEC + value: "600" ports: - name: http containerPort: 8080 diff --git a/services/bstein-dev-home/scripts/gateway.py b/services/bstein-dev-home/scripts/gateway.py index 3ca2fa1..19d3606 100644 --- a/services/bstein-dev-home/scripts/gateway.py +++ b/services/bstein-dev-home/scripts/gateway.py @@ -6,6 +6,7 @@ from urllib import request, error UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat") KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "") KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "") +UPSTREAM_TIMEOUT_SEC = float(os.environ.get("UPSTREAM_TIMEOUT_SEC", "90")) ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k} @@ -41,7 +42,7 @@ class Handler(BaseHTTPRequestHandler): headers={"Content-Type": "application/json"}, method="POST", ) - with request.urlopen(upstream_req, timeout=90) as resp: + with request.urlopen(upstream_req, timeout=UPSTREAM_TIMEOUT_SEC) as resp: data = resp.read() self.send_response(resp.status) for k, v in resp.headers.items(): diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 2c08853..031abb8 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-12 + checksum/atlasbot-configmap: manual-atlasbot-13 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -83,6 +83,10 @@ spec: value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL value: qwen2.5-coder:7b-instruct-q4_0 + - name: OLLAMA_TIMEOUT_SEC + value: "480" + - name: ATLASBOT_THINKING_INTERVAL_SEC + value: "120" resources: requests: cpu: 100m @@ -114,6 +118,8 @@ spec: path: catalog/atlas.json - key: atlas-summary.json path: catalog/atlas-summary.json + - key: metrics.json + path: catalog/metrics.json - key: runbooks.json path: catalog/runbooks.json - key: atlas-http.mmd diff --git a/services/comms/knowledge/catalog/metrics.json b/services/comms/knowledge/catalog/metrics.json new file mode 100644 index 0000000..e929db5 --- /dev/null +++ b/services/comms/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) 
(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - 
max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": 
"prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + 
"atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + 
"panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - 
kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) 
((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": 
"Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", 
\"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * 
on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) 
(kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", 
+ "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) 
(kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 3360067..37f681d 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -73,5 +73,6 @@ configMapGenerator: - INDEX.md=knowledge/INDEX.md - atlas.json=knowledge/catalog/atlas.json - atlas-summary.json=knowledge/catalog/atlas-summary.json + - metrics.json=knowledge/catalog/metrics.json - runbooks.json=knowledge/catalog/runbooks.json - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 8edc28d..e604e65 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -17,7 +17,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") API_KEY = os.environ.get("CHAT_API_KEY", "") -OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "90")) +OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") @@ -29,6 +29,7 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) +THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE) HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)") @@ -59,8 +60,21 @@ STOPWORDS = { } METRIC_HINT_WORDS = { + "bandwidth", + "connections", + "cpu", + "database", + "db", + "disk", "health", + "memory", + "network", + "node", + "nodes", + "postgres", "status", + "storage", + "usage", "down", "slow", "error", @@ -157,6 +171,7 @@ def send_msg(token: str, room: str, text: str): KB = {"catalog": {}, "runbooks": []} _HOST_INDEX: dict[str, list[dict]] = {} _NAME_INDEX: set[str] = set() +_METRIC_INDEX: list[dict[str, Any]] = [] _NODE_CLASS_INDEX: dict[str, list[str]] = {} _NODE_CLASS_RPI4: set[str] = set() _NODE_CLASS_RPI5: set[str] = set() @@ -180,6 +195,7 @@ def load_kb(): return catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} runbooks = _load_json_file(os.path.join(KB_DIR, "catalog", "runbooks.json")) or [] + metrics = _load_json_file(os.path.join(KB_DIR, "catalog", "metrics.json")) or [] KB = {"catalog": catalog, "runbooks": runbooks} host_index: dict[str, list[dict]] = collections.defaultdict(list) @@ -197,6 +213,7 @@ def load_kb(): if isinstance(w, dict) and w.get("name"): names.add(str(w["name"]).lower()) _NAME_INDEX = names + _METRIC_INDEX = metrics if isinstance(metrics, list) else [] node_classes = _parse_node_classes(runbooks) _NODE_CLASS_INDEX = node_classes @@ -356,6 +373,65 @@ def node_inventory_context(query: str) -> str: return "" return "\n".join(lines) +def _metric_tokens(entry: dict[str, Any]) -> str: + parts: list[str] = [] + for key in ("panel_title", "dashboard", "description"): + val = entry.get(key) + if isinstance(val, str) and val: + parts.append(val.lower()) + tags = entry.get("tags") + if isinstance(tags, list): + parts.extend(str(t).lower() for t in tags if t) + return " ".join(parts) + +def metrics_lookup(query: 
str, limit: int = 3) -> list[dict[str, Any]]: + q_tokens = _tokens(query) + if not q_tokens or not _METRIC_INDEX: + return [] + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in _METRIC_INDEX: + if not isinstance(entry, dict): + continue + hay = _metric_tokens(entry) + if not hay: + continue + score = 0 + for t in set(q_tokens): + if t in hay: + score += 2 if t in (entry.get("panel_title") or "").lower() else 1 + if score: + scored.append((score, entry)) + scored.sort(key=lambda item: item[0], reverse=True) + return [entry for _, entry in scored[:limit]] + +def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: + if not allow_tools: + return "", "" + lower = (prompt or "").lower() + if not any(word in lower for word in METRIC_HINT_WORDS): + return "", "" + matches = metrics_lookup(prompt, limit=1) + if not matches: + return "", "" + entry = matches[0] + exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] + if not exprs: + return "", "" + rendered_parts: list[str] = [] + for expr in exprs[:2]: + res = vm_query(expr, timeout=20) + rendered = vm_render_result(res, limit=10) + if rendered: + rendered_parts.append(rendered) + if not rendered_parts: + return "", "" + dashboard = entry.get("dashboard") or "dashboard" + panel = entry.get("panel_title") or "panel" + summary = "\n".join(rendered_parts) + context = f"Metrics (from {dashboard} / {panel}):\n{summary}" + fallback = f"{panel}: {summary}" + return context, fallback + def jetson_nodes_from_kb() -> list[str]: for doc in KB.get("runbooks", []): if not isinstance(doc, dict): @@ -777,6 +853,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "Be helpful, direct, and concise. " "Prefer answering with exact repo paths and Kubernetes resource names. " "Never include or request secret values. " + "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " "If the answer is not grounded in the provided context or tool data, say you do not know." ) @@ -820,7 +897,17 @@ def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, thread.start() if not done.wait(2.0): send_msg(token, room, "Thinking…") - done.wait() + prompt_hint = " ".join((prompt or "").split()) + if len(prompt_hint) > 160: + prompt_hint = prompt_hint[:157] + "…" + heartbeat = max(10, THINKING_INTERVAL_SEC) + next_heartbeat = time.monotonic() + heartbeat + while not done.wait(max(0, next_heartbeat - time.monotonic())): + if prompt_hint: + send_msg(token, room, f"Still thinking about: {prompt_hint} (gathering context)") + else: + send_msg(token, room, "Still thinking (gathering context)…") + next_heartbeat += heartbeat thread.join(timeout=1) return result["reply"] or fallback or "Model backend is busy. Try again in a moment." 
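Aside: the scoring in metrics_lookup above is easiest to see on a concrete catalog entry. The sketch below is standalone and illustrative only; toy_tokens is a simplified stand-in for the bot's _tokens helper (assumed to lowercase text and split it on the TOKEN_RE word pattern), and the sample entry mirrors the "Postgres Connections Used" panel from metrics.json.

    import re

    # Simplified stand-in for the bot's _tokens helper (assumption: lowercase
    # words matching the TOKEN_RE pattern, two or more characters long).
    def toy_tokens(text: str) -> list[str]:
        return re.findall(r"[a-z0-9][a-z0-9_.-]+", (text or "").lower())

    def toy_score(query: str, entry: dict) -> int:
        # Same haystack fields as _metric_tokens: title, dashboard, description, tags.
        hay = " ".join(
            [str(entry.get(k) or "").lower() for k in ("panel_title", "dashboard", "description")]
            + [str(t).lower() for t in entry.get("tags") or []]
        )
        score = 0
        for t in set(toy_tokens(query)):
            if t in hay:
                # Title hits count double, as in metrics_lookup.
                score += 2 if t in (entry.get("panel_title") or "").lower() else 1
        return score

    entry = {
        "dashboard": "Atlas Overview",
        "panel_title": "Postgres Connections Used",
        "description": "",
        "tags": ["atlas", "overview"],
    }
    # "postgres", "connections", "used" all hit the panel title: score 6.
    print(toy_score("how many postgres connections are used", entry))

metrics_query_context then takes the top-scored entry and evaluates up to two of its exprs through vm_query, so the bot answers with live numbers behind the same PromQL the matching dashboard panel renders.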
@@ -937,9 +1024,15 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra + metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_tools) + if metrics_context: + context = (context + "\n\n" + metrics_context).strip() if context else metrics_context + fallback = "" if "node" in lower_body or "cluster" in lower_body: fallback = node_inventory_answer("Atlas", lower_body) + if metrics_fallback and not fallback: + fallback = metrics_fallback reply = ollama_reply_with_thinking( token, rid, From 0ac0f920ca0aa48d43b4e2118df6913a7f13e7a3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 15:34:52 -0300 Subject: [PATCH 240/416] atlasbot: load metrics index and answer in rooms --- services/comms/scripts/atlasbot/bot.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e604e65..ff9019e 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -188,7 +188,7 @@ def _load_json_file(path: str) -> Any | None: return None def load_kb(): - global KB, _HOST_INDEX, _NAME_INDEX + global KB, _HOST_INDEX, _NAME_INDEX, _METRIC_INDEX global _NODE_CLASS_INDEX, _NODE_CLASS_RPI4, _NODE_CLASS_RPI5, _NODE_CLASS_AMD64, _NODE_CLASS_JETSON global _NODE_CLASS_EXTERNAL, _NODE_CLASS_NON_RPI if not KB_DIR: @@ -414,6 +414,8 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: if not matches: return "", "" entry = matches[0] + dashboard = entry.get("dashboard") or "dashboard" + panel = entry.get("panel_title") or "panel" exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] if not exprs: return "", "" @@ -424,9 +426,7 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: if rendered: rendered_parts.append(rendered) if not rendered_parts: - return "", "" - dashboard = entry.get("dashboard") or "dashboard" - panel = entry.get("panel_title") or "panel" + return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data." summary = "\n".join(rendered_parts) context = f"Metrics (from {dashboard} / {panel}):\n{summary}" fallback = f"{panel}: {summary}" @@ -998,8 +998,9 @@ def sync_loop(token: str, room_id: str): send_msg(token, rid, names_summary) continue - # Only do live cluster/metrics introspection in DMs. + # Only do live cluster introspection in DMs; metrics can be answered when mentioned. 
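+ # Metrics lookups via metrics_query_context only evaluate PromQL taken from the generated metrics.json catalog, so room mentions can use them while other live tools stay DM-only.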
allow_tools = is_dm + allow_metrics = is_dm or mentioned promql = "" if allow_tools: @@ -1024,7 +1025,7 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_tools) + metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics) if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context From be7846572f1929e614890d3fd8ddc9e94a2073b5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 15:54:00 -0300 Subject: [PATCH 241/416] atlasbot: recognize prefix mentions --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 031abb8..aa91dcb 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -78,7 +78,7 @@ spec: - name: BOT_USER value: atlasbot - name: BOT_MENTIONS - value: atlasbot + value: atlasbot,aatlasbot - name: OLLAMA_URL value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index ff9019e..f4182cd 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -119,9 +119,21 @@ def normalize_user_id(token: str) -> str: MENTION_USER_IDS = {normalize_user_id(t).lower() for t in MENTION_TOKENS if normalize_user_id(t)} +def _body_mentions_token(body: str) -> bool: + lower = (body or "").strip().lower() + if not lower: + return False + for token in MENTION_LOCALPARTS: + for prefix in (token, f"@{token}"): + if lower.startswith(prefix + ":") or lower.startswith(prefix + ",") or lower.startswith(prefix + " "): + return True + return False + def is_mentioned(content: dict, body: str) -> bool: if MENTION_RE.search(body or "") is not None: return True + if _body_mentions_token(body or ""): + return True mentions = content.get("m.mentions", {}) user_ids = mentions.get("user_ids", []) if not isinstance(user_ids, list): From 7bb1bd96fca67dad9f5c5c2956efc327d1ef30fd Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 18:16:14 -0300 Subject: [PATCH 242/416] atlasbot: improve worker readiness and metrics replies --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 140 +++++++++++++++++++++++- 2 files changed, 140 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index aa91dcb..d5d8f06 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-13 + checksum/atlasbot-configmap: manual-atlasbot-14 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f4182cd..57549b3 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -441,7 +441,7 @@ def 
metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data." summary = "\n".join(rendered_parts) context = f"Metrics (from {dashboard} / {panel}):\n{summary}" - fallback = f"{panel}: {summary}" + fallback = _metrics_fallback_summary(panel, summary) return context, fallback def jetson_nodes_from_kb() -> list[str]: @@ -654,6 +654,115 @@ def vm_render_result(res: dict | None, limit: int = 12) -> str: out.append(f"- {labels}: {val}") return "\n".join(out) +def _parse_metric_lines(summary: str) -> dict[str, str]: + parsed: dict[str, str] = {} + for line in (summary or "").splitlines(): + line = line.strip() + if not line.startswith("-"): + continue + try: + label, value = line.lstrip("-").split(":", 1) + except ValueError: + continue + parsed[label.strip()] = value.strip() + return parsed + +def _metrics_fallback_summary(panel: str, summary: str) -> str: + parsed = _parse_metric_lines(summary) + panel_l = (panel or "").lower() + if panel_l.startswith("postgres connections"): + used = parsed.get("conn=used") + maxv = parsed.get("conn=max") + if used and maxv: + try: + used_i = int(float(used)) + max_i = int(float(maxv)) + except ValueError: + return f"Postgres connections: {summary}" + free = max_i - used_i + return f"Postgres connections: {used_i}/{max_i} used ({free} free)." + if panel_l.startswith("postgres hottest"): + if parsed: + label, value = next(iter(parsed.items())) + return f"Most Postgres connections: {label} = {value}." + return f"{panel}: {summary}" + +def _node_ready_status(node: dict) -> bool | None: + conditions = node.get("status", {}).get("conditions") or [] + for cond in conditions if isinstance(conditions, list) else []: + if cond.get("type") == "Ready": + if cond.get("status") == "True": + return True + if cond.get("status") == "False": + return False + return None + return None + +def _node_is_worker(node: dict) -> bool: + labels = (node.get("metadata") or {}).get("labels") or {} + if labels.get("node-role.kubernetes.io/control-plane") is not None: + return False + if labels.get("node-role.kubernetes.io/master") is not None: + return False + if labels.get("node-role.kubernetes.io/worker") is not None: + return True + return True + +def worker_nodes_status() -> tuple[list[str], list[str]]: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return ([], []) + items = data.get("items") or [] + ready_nodes: list[str] = [] + not_ready_nodes: list[str] = [] + for node in items if isinstance(items, list) else []: + if not _node_is_worker(node): + continue + name = (node.get("metadata") or {}).get("name") or "" + if not name: + continue + ready = _node_ready_status(node) + if ready is True: + ready_nodes.append(name) + elif ready is False: + not_ready_nodes.append(name) + return (sorted(ready_nodes), sorted(not_ready_nodes)) + +def expected_nodes_from_kb() -> set[str]: + if not _NODE_CLASS_INDEX: + return set() + nodes = set().union(*_NODE_CLASS_INDEX.values()) + return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL} + +def missing_nodes_answer(cluster_name: str) -> str: + expected = expected_nodes_from_kb() + if not expected: + return "" + current = set() + try: + data = k8s_get("/api/v1/nodes?limit=500") + items = data.get("items") or [] + for node in items if isinstance(items, list) else []: + name = (node.get("metadata") or {}).get("name") or "" + if name: + current.add(name) + except Exception: + return "" + missing = sorted(expected - 
current) + if not missing: + return f"{cluster_name}: no missing nodes versus KB inventory." + return f"{cluster_name} missing nodes versus KB inventory: {', '.join(missing)}." + +def _should_short_circuit(prompt: str, fallback: str) -> bool: + if not fallback: + return False + lower = (prompt or "").lower() + for word in ("why", "explain", "architecture", "breakdown", "root cause", "plan"): + if word in lower: + return False + return True + def vm_top_restarts(hours: int = 1) -> str: q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" res = vm_query(q) @@ -984,6 +1093,32 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + if "worker" in lower_body and "node" in lower_body: + ready_nodes, not_ready_nodes = worker_nodes_status() + total = len(ready_nodes) + len(not_ready_nodes) + if total: + if any(word in lower_body for word in ("ready", "not ready", "unready")): + if not_ready_nodes: + send_msg( + token, + rid, + f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.", + ) + else: + send_msg(token, rid, f"All {len(ready_nodes)} worker nodes are Ready.") + continue + if any(word in lower_body for word in ("how many", "should")): + send_msg( + token, + rid, + f"Atlas has {total} worker nodes; {len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady.", + ) + continue + if "missing" in lower_body and "node" in lower_body: + missing = missing_nodes_answer("Atlas") + if missing: + send_msg(token, rid, missing) + continue inventory_answer = node_inventory_answer("Atlas", lower_body) if inventory_answer: send_msg(token, rid, inventory_answer) @@ -1046,6 +1181,9 @@ def sync_loop(token: str, room_id: str): fallback = node_inventory_answer("Atlas", lower_body) if metrics_fallback and not fallback: fallback = metrics_fallback + if _should_short_circuit(body, fallback): + send_msg(token, rid, fallback) + continue reply = ollama_reply_with_thinking( token, rid, From 65781aaca72cc10017a5f486d5a9d68d97dfec49 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 18:18:42 -0300 Subject: [PATCH 243/416] atlasbot: improve worker node answers --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d5d8f06..69aef2f 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-14 + checksum/atlasbot-configmap: manual-atlasbot-15 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 57549b3..3b9082d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1097,6 +1097,7 @@ def sync_loop(token: str, room_id: str): ready_nodes, not_ready_nodes = worker_nodes_status() total = len(ready_nodes) + len(not_ready_nodes) if total: + missing_hint = missing_nodes_answer("Atlas") if any(word in lower_body for word in ("ready", "not ready", "unready")): if not_ready_nodes: send_msg( @@ -1105,14 +1106,19 @@ def sync_loop(token: str, room_id: str): f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.", ) else: - 
send_msg(token, rid, f"All {len(ready_nodes)} worker nodes are Ready.") + msg = f"All {len(ready_nodes)} worker nodes are Ready." + if missing_hint and "no missing" not in missing_hint: + msg += f" {missing_hint}" + send_msg(token, rid, msg) continue if any(word in lower_body for word in ("how many", "should")): - send_msg( - token, - rid, - f"Atlas has {total} worker nodes; {len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady.", + msg = ( + f"Atlas has {total} worker nodes; " + f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady." ) + if missing_hint and "no missing" not in missing_hint: + msg += f" {missing_hint}" + send_msg(token, rid, msg) continue if "missing" in lower_body and "node" in lower_body: missing = missing_nodes_answer("Atlas") From dfa13e22cc65712457ccde2e910b50d201db147f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 18:21:17 -0300 Subject: [PATCH 244/416] atlasbot: clarify worker count limits --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 69aef2f..802021f 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-15 + checksum/atlasbot-configmap: manual-atlasbot-16 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 3b9082d..7153723 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1118,6 +1118,8 @@ def sync_loop(token: str, room_id: str): ) if missing_hint and "no missing" not in missing_hint: msg += f" {missing_hint}" + elif "should" in lower_body: + msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state." 
send_msg(token, rid, msg) continue if "missing" in lower_body and "node" in lower_body: From 0d5e19e11adb38fcb5a042a08306675cb4439885 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 18:50:23 -0300 Subject: [PATCH 245/416] atlasbot: infer worker expected count from metrics --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 33 +++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 802021f..b7843ab 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-16 + checksum/atlasbot-configmap: manual-atlasbot-17 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7153723..bd40a9f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -191,6 +191,7 @@ _NODE_CLASS_AMD64: set[str] = set() _NODE_CLASS_JETSON: set[str] = set() _NODE_CLASS_EXTERNAL: set[str] = set() _NODE_CLASS_NON_RPI: set[str] = set() +NODE_REGEX = re.compile(r'node=~"([^"]+)"') def _load_json_file(path: str) -> Any | None: try: @@ -735,6 +736,23 @@ def expected_nodes_from_kb() -> set[str]: nodes = set().union(*_NODE_CLASS_INDEX.values()) return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL} +def expected_worker_nodes_from_metrics() -> list[str]: + for entry in _METRIC_INDEX: + panel = (entry.get("panel_title") or "").lower() + if "worker nodes ready" not in panel: + continue + exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] + for expr in exprs: + if not isinstance(expr, str): + continue + match = NODE_REGEX.search(expr) + if not match: + continue + raw = match.group(1) + nodes = [n.strip() for n in raw.split("|") if n.strip()] + return sorted(nodes) + return [] + def missing_nodes_answer(cluster_name: str) -> str: expected = expected_nodes_from_kb() if not expected: @@ -1098,6 +1116,8 @@ def sync_loop(token: str, room_id: str): total = len(ready_nodes) + len(not_ready_nodes) if total: missing_hint = missing_nodes_answer("Atlas") + expected_workers = expected_worker_nodes_from_metrics() + expected_total = len(expected_workers) if expected_workers else 0 if any(word in lower_body for word in ("ready", "not ready", "unready")): if not_ready_nodes: send_msg( @@ -1107,7 +1127,11 @@ def sync_loop(token: str, room_id: str): ) else: msg = f"All {len(ready_nodes)} worker nodes are Ready." - if missing_hint and "no missing" not in missing_hint: + if expected_total and len(ready_nodes) != expected_total: + missing = sorted(set(expected_workers) - set(ready_nodes)) + if missing: + msg += f" Missing: {', '.join(missing)}." + elif missing_hint and "no missing" not in missing_hint: msg += f" {missing_hint}" send_msg(token, rid, msg) continue @@ -1116,7 +1140,12 @@ def sync_loop(token: str, room_id: str): f"Atlas has {total} worker nodes; " f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady." ) - if missing_hint and "no missing" not in missing_hint: + if expected_total: + msg += f" Grafana inventory expects {expected_total} workers." 
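# Sketch (not part of this patch) of how expected_worker_nodes_from_metrics()
# derives expected_workers: NODE_REGEX lifts the node=~"..." matcher out of a
# Grafana panel expression and splits it on "|". The expression below is
# illustrative, not copied from the KB:
#
#     import re
#     NODE_REGEX = re.compile(r'node=~"([^"]+)"')
#     expr = 'sum(kube_node_status_condition{node=~"titan-01|titan-02|titan-03"})'
#     m = NODE_REGEX.search(expr)
#     nodes = sorted(n.strip() for n in m.group(1).split("|") if n.strip())
#     # nodes == ["titan-01", "titan-02", "titan-03"]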
+ missing = sorted(set(expected_workers) - set(ready_nodes)) + if missing: + msg += f" Missing: {', '.join(missing)}." + elif missing_hint and "no missing" not in missing_hint: msg += f" {missing_hint}" elif "should" in lower_body: msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state." From b6e8c01e99171d09dae7af3d719c2803ae20e257 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:01:26 -0300 Subject: [PATCH 246/416] atlasbot: improve missing node inference --- services/comms/scripts/atlasbot/bot.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index bd40a9f..7eb6dc7 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -754,6 +754,15 @@ def expected_worker_nodes_from_metrics() -> list[str]: return [] def missing_nodes_answer(cluster_name: str) -> str: + expected_workers = expected_worker_nodes_from_metrics() + if expected_workers: + ready_nodes, not_ready_nodes = worker_nodes_status() + current_workers = set(ready_nodes + not_ready_nodes) + missing = sorted(set(expected_workers) - current_workers) + if not missing: + return f"{cluster_name}: no missing worker nodes versus Grafana inventory." + return f"{cluster_name} missing worker nodes versus Grafana inventory: {', '.join(missing)}." + expected = expected_nodes_from_kb() if not expected: return "" @@ -1173,7 +1182,7 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue - if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body): + if re.search(r"\bnode names?\b|\bnodes?\b.*\bnamed\b|\bnaming\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): names_summary = nodes_names_summary("Atlas") if not names_summary: @@ -1181,6 +1190,14 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, names_summary) continue + if re.search(r"\bwhich nodes are ready\b|\bnodes ready\b", lower_body): + ready_nodes, not_ready_nodes = worker_nodes_status() + if ready_nodes: + msg = f"Ready worker nodes ({len(ready_nodes)}): {', '.join(ready_nodes)}." + if not_ready_nodes: + msg += f" Not Ready: {', '.join(not_ready_nodes)}." + send_msg(token, rid, msg) + continue # Only do live cluster introspection in DMs; metrics can be answered when mentioned. 
allow_tools = is_dm From d666e6a156389733dd980c07cbcc1a175e1c152d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:02:54 -0300 Subject: [PATCH 247/416] atlasbot: roll deployment --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index b7843ab..e45d9f3 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-17 + checksum/atlasbot-configmap: manual-atlasbot-18 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From ff04341559ea7c074731628ecb7d6b8825689108 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:22:28 -0300 Subject: [PATCH 248/416] atlasbot: use live node inventory context --- services/comms/scripts/atlasbot/bot.py | 320 +++++++------------------ 1 file changed, 89 insertions(+), 231 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7eb6dc7..e070ead 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -184,13 +184,6 @@ KB = {"catalog": {}, "runbooks": []} _HOST_INDEX: dict[str, list[dict]] = {} _NAME_INDEX: set[str] = set() _METRIC_INDEX: list[dict[str, Any]] = [] -_NODE_CLASS_INDEX: dict[str, list[str]] = {} -_NODE_CLASS_RPI4: set[str] = set() -_NODE_CLASS_RPI5: set[str] = set() -_NODE_CLASS_AMD64: set[str] = set() -_NODE_CLASS_JETSON: set[str] = set() -_NODE_CLASS_EXTERNAL: set[str] = set() -_NODE_CLASS_NON_RPI: set[str] = set() NODE_REGEX = re.compile(r'node=~"([^"]+)"') def _load_json_file(path: str) -> Any | None: @@ -202,8 +195,6 @@ def _load_json_file(path: str) -> Any | None: def load_kb(): global KB, _HOST_INDEX, _NAME_INDEX, _METRIC_INDEX - global _NODE_CLASS_INDEX, _NODE_CLASS_RPI4, _NODE_CLASS_RPI5, _NODE_CLASS_AMD64, _NODE_CLASS_JETSON - global _NODE_CLASS_EXTERNAL, _NODE_CLASS_NON_RPI if not KB_DIR: return catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} @@ -228,24 +219,6 @@ def load_kb(): _NAME_INDEX = names _METRIC_INDEX = metrics if isinstance(metrics, list) else [] - node_classes = _parse_node_classes(runbooks) - _NODE_CLASS_INDEX = node_classes - _NODE_CLASS_RPI4 = set(node_classes.get("rpi4", [])) - _NODE_CLASS_RPI5 = set(node_classes.get("rpi5", [])) - _NODE_CLASS_AMD64 = set(node_classes.get("amd64", [])) - _NODE_CLASS_JETSON = set(node_classes.get("jetson", [])) - _NODE_CLASS_EXTERNAL = set(node_classes.get("external", [])) - _NODE_CLASS_NON_RPI = set( - sorted( - ( - set().union(*node_classes.values()) - - _NODE_CLASS_RPI4 - - _NODE_CLASS_RPI5 - - _NODE_CLASS_EXTERNAL - ) - ) - ) - def kb_retrieve(query: str, *, limit: int = 3) -> str: q = (query or "").strip() if not q or not KB.get("runbooks"): @@ -309,81 +282,92 @@ def _extract_titan_nodes(text: str) -> list[str]: names.add(f"titan-{right.lower()}") return sorted(names) -def _parse_node_classes(runbooks: list[dict[str, Any]]) -> dict[str, list[str]]: - classes: dict[str, list[str]] = {} - for doc in runbooks: - if not isinstance(doc, dict): - continue - body = str(doc.get("body") or "") - for line in body.splitlines(): - stripped = line.strip() - if "titan-" not in stripped.lower(): - continue - label = "" - nodes: list[str] = [] - 
if stripped.startswith("-") and ":" in stripped: - label, rest = stripped.lstrip("-").split(":", 1) - nodes = _extract_titan_nodes(rest) - label = label.strip().lower() - else: - nodes = _extract_titan_nodes(stripped) - if not nodes: - continue - if "jetson" in stripped.lower(): - classes.setdefault("jetson", nodes) - if "amd64" in stripped.lower() or "x86" in stripped.lower(): - classes.setdefault("amd64", nodes) - if "rpi4" in stripped.lower(): - classes.setdefault("rpi4", nodes) - if "rpi5" in stripped.lower(): - classes.setdefault("rpi5", nodes) - if "external" in stripped.lower() or "non-cluster" in stripped.lower(): - classes.setdefault("external", nodes) - if label: - classes.setdefault(label, nodes) - return {k: sorted(set(v)) for k, v in classes.items()} +def _node_roles(labels: dict[str, Any]) -> list[str]: + roles: list[str] = [] + for key in labels.keys(): + if key.startswith("node-role.kubernetes.io/"): + role = key.split("/", 1)[-1] + if role: + roles.append(role) + return sorted(set(roles)) -def node_inventory_answer(cluster_name: str, query: str) -> str: - q = (query or "").lower() - if "jetson" in q and _NODE_CLASS_JETSON: - names = sorted(_NODE_CLASS_JETSON) - return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}." - if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: - names = sorted(_NODE_CLASS_NON_RPI) - if names: - return f"{cluster_name} non‑Raspberry Pi nodes: {', '.join(names)}." - if "raspberry" in q or "rpi" in q: - if "rpi4" in q and _NODE_CLASS_RPI4: - names = sorted(_NODE_CLASS_RPI4) - return f"{cluster_name} rpi4 nodes: {', '.join(names)}." - if "rpi5" in q and _NODE_CLASS_RPI5: - names = sorted(_NODE_CLASS_RPI5) - return f"{cluster_name} rpi5 nodes: {', '.join(names)}." - names = sorted(_NODE_CLASS_RPI4 | _NODE_CLASS_RPI5) - if names: - return f"{cluster_name} Raspberry Pi nodes: {', '.join(names)}." - if ("amd64" in q or "x86" in q) and _NODE_CLASS_AMD64: - names = sorted(_NODE_CLASS_AMD64) - return f"{cluster_name} amd64 nodes: {', '.join(names)}." 
- return "" +def _hardware_class(labels: dict[str, Any]) -> str: + if str(labels.get("jetson") or "").lower() == "true": + return "jetson" + hardware = (labels.get("hardware") or "").strip().lower() + if hardware in ("rpi4", "rpi5"): + return hardware + arch = labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "" + if arch == "amd64": + return "amd64" + if arch == "arm64": + return "arm64-unknown" + return "unknown" + +def node_inventory_live() -> list[dict[str, Any]]: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return [] + items = data.get("items") or [] + inventory: list[dict[str, Any]] = [] + for node in items if isinstance(items, list) else []: + meta = node.get("metadata") or {} + labels = meta.get("labels") or {} + name = meta.get("name") or "" + if not name: + continue + inventory.append( + { + "name": name, + "arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", + "hardware": _hardware_class(labels), + "roles": _node_roles(labels), + "ready": _node_ready_status(node), + } + ) + return sorted(inventory, key=lambda item: item["name"]) + +def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: + grouped: dict[str, list[str]] = collections.defaultdict(list) + for node in inventory: + grouped[node.get("hardware") or "unknown"].append(node["name"]) + return {k: sorted(v) for k, v in grouped.items()} def node_inventory_context(query: str) -> str: q = (query or "").lower() - if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "x86", "cluster")): + if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): return "" - lines: list[str] = ["Node inventory (KB):"] - if _NODE_CLASS_RPI5: - lines.append(f"- rpi5: {', '.join(sorted(_NODE_CLASS_RPI5))}") - if _NODE_CLASS_RPI4: - lines.append(f"- rpi4: {', '.join(sorted(_NODE_CLASS_RPI4))}") - if _NODE_CLASS_JETSON: - lines.append(f"- jetson: {', '.join(sorted(_NODE_CLASS_JETSON))}") - if _NODE_CLASS_AMD64: - lines.append(f"- amd64: {', '.join(sorted(_NODE_CLASS_AMD64))}") - if _NODE_CLASS_EXTERNAL: - lines.append(f"- external: {', '.join(sorted(_NODE_CLASS_EXTERNAL))}") - if len(lines) == 1: + inventory = node_inventory_live() + if not inventory: return "" + groups = _group_nodes(inventory) + total = len(inventory) + ready = sum(1 for node in inventory if node.get("ready") is True) + not_ready = sum(1 for node in inventory if node.get("ready") is False) + lines: list[str] = [ + "Node inventory (live):", + f"- total: {total}, ready: {ready}, not ready: {not_ready}", + ] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + if key in groups: + lines.append(f"- {key}: {', '.join(groups[key])}") + non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) + if non_rpi: + lines.append(f"- non_raspberry_pi (derived): {', '.join(non_rpi)}") + unknowns = groups.get("arm64-unknown", []) + groups.get("unknown", []) + if unknowns: + lines.append("- note: nodes labeled arm64-unknown/unknown may still be Raspberry Pi unless tagged.") + expected_workers = expected_worker_nodes_from_metrics() + if expected_workers: + ready_workers, not_ready_workers = worker_nodes_status() + missing = sorted(set(expected_workers) - set(ready_workers + not_ready_workers)) + lines.append(f"- expected_workers (grafana): {', '.join(expected_workers)}") + lines.append(f"- workers_ready: {', '.join(ready_workers)}") + if not_ready_workers: 
+ lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") + if missing: + lines.append(f"- workers_missing (derived): {', '.join(missing)}") return "\n".join(lines) def _metric_tokens(entry: dict[str, Any]) -> str: @@ -730,12 +714,6 @@ def worker_nodes_status() -> tuple[list[str], list[str]]: not_ready_nodes.append(name) return (sorted(ready_nodes), sorted(not_ready_nodes)) -def expected_nodes_from_kb() -> set[str]: - if not _NODE_CLASS_INDEX: - return set() - nodes = set().union(*_NODE_CLASS_INDEX.values()) - return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL} - def expected_worker_nodes_from_metrics() -> list[str]: for entry in _METRIC_INDEX: panel = (entry.get("panel_title") or "").lower() @@ -753,42 +731,13 @@ def expected_worker_nodes_from_metrics() -> list[str]: return sorted(nodes) return [] -def missing_nodes_answer(cluster_name: str) -> str: - expected_workers = expected_worker_nodes_from_metrics() - if expected_workers: - ready_nodes, not_ready_nodes = worker_nodes_status() - current_workers = set(ready_nodes + not_ready_nodes) - missing = sorted(set(expected_workers) - current_workers) - if not missing: - return f"{cluster_name}: no missing worker nodes versus Grafana inventory." - return f"{cluster_name} missing worker nodes versus Grafana inventory: {', '.join(missing)}." - - expected = expected_nodes_from_kb() - if not expected: +def _context_fallback(context: str) -> str: + if not context: return "" - current = set() - try: - data = k8s_get("/api/v1/nodes?limit=500") - items = data.get("items") or [] - for node in items if isinstance(items, list) else []: - name = (node.get("metadata") or {}).get("name") or "" - if name: - current.add(name) - except Exception: - return "" - missing = sorted(expected - current) - if not missing: - return f"{cluster_name}: no missing nodes versus KB inventory." - return f"{cluster_name} missing nodes versus KB inventory: {', '.join(missing)}." - -def _should_short_circuit(prompt: str, fallback: str) -> bool: - if not fallback: - return False - lower = (prompt or "").lower() - for word in ("why", "explain", "architecture", "breakdown", "root cause", "plan"): - if word in lower: - return False - return True + trimmed = context.strip() + if len(trimmed) > MAX_TOOL_CHARS: + trimmed = trimmed[: MAX_TOOL_CHARS - 3].rstrip() + "..." + return "I couldn’t reach the model backend. Here is the data I found:\n" + trimmed def vm_top_restarts(hours: int = 1) -> str: q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" @@ -1112,92 +1061,6 @@ def sync_loop(token: str, room_id: str): continue lower_body = body.lower() - if re.search(r"\bhow many nodes\b|\bnode count\b|\bnumber of nodes\b", lower_body): - if any(word in lower_body for word in ("cluster", "atlas", "titan")): - summary = nodes_summary("Atlas") - if not summary: - send_msg(token, rid, "I couldn’t reach the cluster API to count nodes. 
Try again in a moment.") - continue - send_msg(token, rid, summary) - continue - if "worker" in lower_body and "node" in lower_body: - ready_nodes, not_ready_nodes = worker_nodes_status() - total = len(ready_nodes) + len(not_ready_nodes) - if total: - missing_hint = missing_nodes_answer("Atlas") - expected_workers = expected_worker_nodes_from_metrics() - expected_total = len(expected_workers) if expected_workers else 0 - if any(word in lower_body for word in ("ready", "not ready", "unready")): - if not_ready_nodes: - send_msg( - token, - rid, - f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.", - ) - else: - msg = f"All {len(ready_nodes)} worker nodes are Ready." - if expected_total and len(ready_nodes) != expected_total: - missing = sorted(set(expected_workers) - set(ready_nodes)) - if missing: - msg += f" Missing: {', '.join(missing)}." - elif missing_hint and "no missing" not in missing_hint: - msg += f" {missing_hint}" - send_msg(token, rid, msg) - continue - if any(word in lower_body for word in ("how many", "should")): - msg = ( - f"Atlas has {total} worker nodes; " - f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady." - ) - if expected_total: - msg += f" Grafana inventory expects {expected_total} workers." - missing = sorted(set(expected_workers) - set(ready_nodes)) - if missing: - msg += f" Missing: {', '.join(missing)}." - elif missing_hint and "no missing" not in missing_hint: - msg += f" {missing_hint}" - elif "should" in lower_body: - msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state." - send_msg(token, rid, msg) - continue - if "missing" in lower_body and "node" in lower_body: - missing = missing_nodes_answer("Atlas") - if missing: - send_msg(token, rid, missing) - continue - inventory_answer = node_inventory_answer("Atlas", lower_body) - if inventory_answer: - send_msg(token, rid, inventory_answer) - continue - if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")): - if any(word in lower_body for word in ("cluster", "atlas", "titan")): - arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64" - summary = nodes_arch_summary("Atlas", arch) - if not summary: - send_msg( - token, - rid, - "I couldn’t reach the cluster API to count nodes by architecture. Try again in a moment.", - ) - continue - send_msg(token, rid, summary) - continue - if re.search(r"\bnode names?\b|\bnodes?\b.*\bnamed\b|\bnaming\b", lower_body): - if any(word in lower_body for word in ("cluster", "atlas", "titan")): - names_summary = nodes_names_summary("Atlas") - if not names_summary: - send_msg(token, rid, "I couldn’t reach the cluster API to list node names. Try again in a moment.") - continue - send_msg(token, rid, names_summary) - continue - if re.search(r"\bwhich nodes are ready\b|\bnodes ready\b", lower_body): - ready_nodes, not_ready_nodes = worker_nodes_status() - if ready_nodes: - msg = f"Ready worker nodes ({len(ready_nodes)}): {', '.join(ready_nodes)}." - if not_ready_nodes: - msg += f" Not Ready: {', '.join(not_ready_nodes)}." - send_msg(token, rid, msg) - continue # Only do live cluster introspection in DMs; metrics can be answered when mentioned. 
allow_tools = is_dm @@ -1230,14 +1093,9 @@ def sync_loop(token: str, room_id: str): if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = "" - if "node" in lower_body or "cluster" in lower_body: - fallback = node_inventory_answer("Atlas", lower_body) - if metrics_fallback and not fallback: - fallback = metrics_fallback - if _should_short_circuit(body, fallback): - send_msg(token, rid, fallback) - continue + fallback = metrics_fallback or "" + if not fallback and context: + fallback = _context_fallback(context) reply = ollama_reply_with_thinking( token, rid, From 3e4351ef19a810e731fdb82d6632392e02622545 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:24:03 -0300 Subject: [PATCH 249/416] atlasbot: reload for live inventory --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index e45d9f3..4d5b31c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-18 + checksum/atlasbot-configmap: manual-atlasbot-19 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From bf2d4cff907d841cd0a04abecc60fe24087d7696 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:29:26 -0300 Subject: [PATCH 250/416] atlasbot: answer from live inventory --- services/comms/scripts/atlasbot/bot.py | 123 +++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 7 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e070ead..6fc654b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -334,11 +334,12 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: grouped[node.get("hardware") or "unknown"].append(node["name"]) return {k: sorted(v) for k, v in grouped.items()} -def node_inventory_context(query: str) -> str: +def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str: q = (query or "").lower() if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): return "" - inventory = node_inventory_live() + if inventory is None: + inventory = node_inventory_live() if not inventory: return "" groups = _group_nodes(inventory) @@ -370,6 +371,101 @@ def node_inventory_context(query: str) -> str: lines.append(f"- workers_missing (derived): {', '.join(missing)}") return "\n".join(lines) +def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: + q = (prompt or "").lower() + if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")): + return node_inventory_live() + return [] + +def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: + names = [node["name"] for node in inventory] + ready = [node["name"] for node in inventory if node.get("ready") is True] + not_ready = [node["name"] for node in inventory if node.get("ready") is False] + groups = _group_nodes(inventory) + return { + "names": sorted(names), + "ready": sorted(ready), + "not_ready": sorted(not_ready), + "groups": groups, + } + +def structured_answer(prompt: str, *, 
inventory: list[dict[str, Any]], metrics_summary: str) -> str: + q = (prompt or "").lower() + if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")): + return metrics_summary + + if not inventory: + return "" + + sets = _inventory_sets(inventory) + names = sets["names"] + ready = sets["ready"] + not_ready = sets["not_ready"] + groups = sets["groups"] + total = len(names) + + for node in _extract_titan_nodes(q): + if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q): + if node in names: + return f"Yes. {node} is in the Atlas cluster." + return f"No. {node} is not in the Atlas cluster." + + if any(word in q for word in ("how many", "count", "number")) and "node" in q and "worker" not in q: + return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready." + + if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q: + return "Atlas node names: " + ", ".join(names) + "." + + if "ready" in q and "node" in q and "worker" in q: + if "not ready" in q or "unready" in q or "down" in q: + return "Worker nodes not ready: " + (", ".join(not_ready) if not_ready else "none") + "." + return "Ready worker nodes ({}): {}.".format(len(ready), ", ".join(ready)) + + if "worker" in q and any(word in q for word in ("missing", "expected", "should")): + expected_workers = expected_worker_nodes_from_metrics() + missing = sorted(set(expected_workers) - set(ready + not_ready)) if expected_workers else [] + if "missing" in q and missing: + return "Missing worker nodes: " + ", ".join(missing) + "." + if expected_workers: + msg = f"Grafana inventory expects {len(expected_workers)} workers." + if missing: + msg += f" Missing: {', '.join(missing)}." + return msg + return "No expected worker inventory found; using live cluster state." + + if "worker" in q and "node" in q and "ready" not in q and "missing" not in q: + return f"Worker nodes: {len(ready)} ready, {len(not_ready)} not ready." + + if "jetson" in q: + jets = groups.get("jetson", []) + return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found." + + if "amd64" in q or "x86" in q: + amd = groups.get("amd64", []) + return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found." + + if "rpi4" in q: + rpi4 = groups.get("rpi4", []) + return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found." + + if "rpi5" in q: + rpi5 = groups.get("rpi5", []) + return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found." + + if "raspberry" in q or "rpi" in q: + rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))) + return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found." + + if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: + non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) + return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found." + + if "arm64-unknown" in q or "unknown" in q: + unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))) + return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels." 
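# Note: the branches above form a keyword router over the live inventory.
# Illustrative prompts (hypothetical) and where they land:
#   "how many nodes does atlas have"    -> total/ready/not-ready count
#   "list the jetson nodes"             -> jetson group
#   "which worker nodes are not ready"  -> worker not-ready list
# A prompt matching no keyword family falls through to the final return ""
# below, so open-ended questions still reach the LLM path.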
+ + return "" + def _metric_tokens(entry: dict[str, Any]) -> str: parts: list[str] = [] for key in ("panel_title", "dashboard", "description"): @@ -900,7 +996,13 @@ history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] ( def key_for(room_id: str, sender: str, is_dm: bool): return (room_id, None) if is_dm else (room_id, sender) -def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, str]]) -> str: +def build_context( + prompt: str, + *, + allow_tools: bool, + targets: list[tuple[str, str]], + inventory: list[dict[str, Any]] | None = None, +) -> str: parts: list[str] = [] kb = kb_retrieve(prompt) @@ -911,9 +1013,9 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st if endpoints: parts.append(endpoints) - inventory = node_inventory_context(prompt) - if inventory: - parts.append(inventory) + node_ctx = node_inventory_context(prompt, inventory) + if node_ctx: + parts.append(node_ctx) if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. @@ -1083,7 +1185,8 @@ def sync_loop(token: str, room_id: str): if isinstance(w, dict) and w.get("name"): targets.append((ns, str(w["name"]))) - context = build_context(body, allow_tools=allow_tools, targets=targets) + inventory = node_inventory_for_prompt(body) + context = build_context(body, allow_tools=allow_tools, targets=targets, inventory=inventory) if allow_tools and promql: res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" @@ -1096,6 +1199,12 @@ def sync_loop(token: str, room_id: str): fallback = metrics_fallback or "" if not fallback and context: fallback = _context_fallback(context) + + structured = structured_answer(body, inventory=inventory, metrics_summary=metrics_fallback or "") + if structured: + send_msg(token, rid, structured) + continue + reply = ollama_reply_with_thinking( token, rid, From 2d09e7f965088d67f202216daf7b3a6e49ed2709 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:31:07 -0300 Subject: [PATCH 251/416] atlasbot: reload inventory answers --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4d5b31c..57705ec 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-19 + checksum/atlasbot-configmap: manual-atlasbot-20 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 16d0a22163d49e3dff0bc06dbaebdf2338bcf67e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:34:19 -0300 Subject: [PATCH 252/416] atlasbot: generalize inventory answers --- services/comms/scripts/atlasbot/bot.py | 80 ++++++++++++++++---------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6fc654b..d06645a 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -382,11 +382,18 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: ready = [node["name"] for node in inventory if node.get("ready") is True] not_ready = [node["name"] for node in inventory if node.get("ready") is False] groups = _group_nodes(inventory) + 
workers = [node for node in inventory if "worker" in (node.get("roles") or [])] + worker_names = [node["name"] for node in workers] + worker_ready = [node["name"] for node in workers if node.get("ready") is True] + worker_not_ready = [node["name"] for node in workers if node.get("ready") is False] return { "names": sorted(names), "ready": sorted(ready), "not_ready": sorted(not_ready), "groups": groups, + "worker_names": sorted(worker_names), + "worker_ready": sorted(worker_ready), + "worker_not_ready": sorted(worker_not_ready), } def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: @@ -402,6 +409,9 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s ready = sets["ready"] not_ready = sets["not_ready"] groups = sets["groups"] + worker_names = sets["worker_names"] + worker_ready = sets["worker_ready"] + worker_not_ready = sets["worker_not_ready"] total = len(names) for node in _extract_titan_nodes(q): @@ -410,31 +420,12 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s return f"Yes. {node} is in the Atlas cluster." return f"No. {node} is not in the Atlas cluster." - if any(word in q for word in ("how many", "count", "number")) and "node" in q and "worker" not in q: - return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready." - - if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q: - return "Atlas node names: " + ", ".join(names) + "." - - if "ready" in q and "node" in q and "worker" in q: - if "not ready" in q or "unready" in q or "down" in q: - return "Worker nodes not ready: " + (", ".join(not_ready) if not_ready else "none") + "." - return "Ready worker nodes ({}): {}.".format(len(ready), ", ".join(ready)) - - if "worker" in q and any(word in q for word in ("missing", "expected", "should")): - expected_workers = expected_worker_nodes_from_metrics() - missing = sorted(set(expected_workers) - set(ready + not_ready)) if expected_workers else [] - if "missing" in q and missing: - return "Missing worker nodes: " + ", ".join(missing) + "." - if expected_workers: - msg = f"Grafana inventory expects {len(expected_workers)} workers." - if missing: - msg += f" Missing: {', '.join(missing)}." - return msg - return "No expected worker inventory found; using live cluster state." - - if "worker" in q and "node" in q and "ready" not in q and "missing" not in q: - return f"Worker nodes: {len(ready)} ready, {len(not_ready)} not ready." + if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: + non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) + if "besides" in q: + amd = groups.get("amd64", []) + return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson." + return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found." if "jetson" in q: jets = groups.get("jetson", []) @@ -446,24 +437,53 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s if "rpi4" in q: rpi4 = groups.get("rpi4", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(rpi4)} rpi4 nodes." return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found." if "rpi5" in q: rpi5 = groups.get("rpi5", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(rpi5)} rpi5 nodes." return f"rpi5 nodes: {', '.join(rpi5)}." 
if rpi5 else "No rpi5 nodes found." if "raspberry" in q or "rpi" in q: rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(rpi)} Raspberry Pi nodes." return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found." - if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: - non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) - return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found." - - if "arm64-unknown" in q or "unknown" in q: + if "arm64-unknown" in q or "unknown" in q or "no hardware" in q: unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))) return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels." + if "worker" in q and "node" in q: + if any(word in q for word in ("missing", "expected", "should")): + expected_workers = expected_worker_nodes_from_metrics() + missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else [] + if "missing" in q and missing: + return "Missing worker nodes: " + ", ".join(missing) + "." + if expected_workers: + msg = f"Grafana inventory expects {len(expected_workers)} workers." + if missing: + msg += f" Missing: {', '.join(missing)}." + return msg + return "No expected worker inventory found; using live cluster state." + if "not ready" in q or "unready" in q or "down" in q: + return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "." + if any(word in q for word in ("how many", "count", "number")): + return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready." + return "Ready worker nodes ({}): {}.".format(len(worker_ready), ", ".join(worker_ready)) + + if any(word in q for word in ("how many", "count", "number")) and "node" in q: + return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready." + + if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q: + return "Atlas node names: " + ", ".join(names) + "." + + if "ready" in q and "node" in q: + return f"Ready nodes ({len(ready)}): {', '.join(ready)}." 
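# Sketch (not part of this patch; node names are hypothetical) of the
# expected-vs-live reconciliation used in the worker branches above:
#
#     expected = {"titan-01", "titan-02", "titan-03"}  # from the Grafana panel expr
#     live     = {"titan-01", "titan-03"}              # ready + not ready, from the API
#     missing  = sorted(expected - live)               # -> ["titan-02"]
#
# expected_ready / expected_not_ready partition the expected set by live
# status, so counting questions are answered against the expectation rather
# than only the nodes currently registered.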
+ return "" def _metric_tokens(entry: dict[str, Any]) -> str: From a61091c052094adece4c4104cdcadaac1379c06b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:34:42 -0300 Subject: [PATCH 253/416] atlasbot: reload structured answers --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 57705ec..c723d22 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-20 + checksum/atlasbot-configmap: manual-atlasbot-21 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From b27c80d5c0f7221c663634491d37c08f1cccaa83 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:53:11 -0300 Subject: [PATCH 254/416] atlasbot: improve node inventory reasoning --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 154 +++++++++++++++++++----- 2 files changed, 122 insertions(+), 34 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c723d22..7cc66b3 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-21 + checksum/atlasbot-configmap: manual-atlasbot-22 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index d06645a..6993db2 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -89,9 +89,17 @@ METRIC_HINT_WORDS = { "latency", } -CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL) -TITAN_NODE_RE = re.compile(r"\\btitan-[0-9a-z]{2}\\b", re.IGNORECASE) -TITAN_RANGE_RE = re.compile(r"\\btitan-([0-9a-z]{2})/([0-9a-z]{2})\\b", re.IGNORECASE) +CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) +TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) +TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) +_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" + +def normalize_query(text: str) -> str: + cleaned = (text or "").lower() + for ch in _DASH_CHARS: + cleaned = cleaned.replace(ch, "-") + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned def _tokens(text: str) -> list[str]: toks = [t.lower() for t in TOKEN_RE.findall(text or "")] @@ -267,14 +275,15 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: return "\n".join(parts).strip() def _extract_titan_nodes(text: str) -> list[str]: - names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n} - for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE): + cleaned = normalize_query(text) + names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n} + for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", cleaned, re.IGNORECASE): tail = match.group(1) for part in re.split(r"[/,]", tail): part = part.strip() if part: names.add(f"titan-{part.lower()}") - for 
match in TITAN_RANGE_RE.finditer(text or ""): + for match in TITAN_RANGE_RE.finditer(cleaned): left, right = match.groups() if left: names.add(f"titan-{left.lower()}") @@ -323,6 +332,7 @@ def node_inventory_live() -> list[dict[str, Any]]: "arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", "hardware": _hardware_class(labels), "roles": _node_roles(labels), + "is_worker": _node_is_worker(node), "ready": _node_ready_status(node), } ) @@ -335,7 +345,7 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: return {k: sorted(v) for k, v in grouped.items()} def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str: - q = (query or "").lower() + q = normalize_query(query) if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): return "" if inventory is None: @@ -372,7 +382,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = return "\n".join(lines) def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: - q = (prompt or "").lower() + q = normalize_query(prompt) if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")): return node_inventory_live() return [] @@ -382,10 +392,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: ready = [node["name"] for node in inventory if node.get("ready") is True] not_ready = [node["name"] for node in inventory if node.get("ready") is False] groups = _group_nodes(inventory) - workers = [node for node in inventory if "worker" in (node.get("roles") or [])] + workers = [node for node in inventory if node.get("is_worker") is True] worker_names = [node["name"] for node in workers] worker_ready = [node["name"] for node in workers if node.get("ready") is True] worker_not_ready = [node["name"] for node in workers if node.get("ready") is False] + expected_workers = expected_worker_nodes_from_metrics() + expected_ready = [n for n in expected_workers if n in ready] if expected_workers else [] + expected_not_ready = [n for n in expected_workers if n in not_ready] if expected_workers else [] + expected_missing = [n for n in expected_workers if n not in names] if expected_workers else [] return { "names": sorted(names), "ready": sorted(ready), @@ -394,10 +408,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: "worker_names": sorted(worker_names), "worker_ready": sorted(worker_ready), "worker_not_ready": sorted(worker_not_ready), + "expected_workers": expected_workers, + "expected_ready": sorted(expected_ready), + "expected_not_ready": sorted(expected_not_ready), + "expected_missing": sorted(expected_missing), } def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: - q = (prompt or "").lower() + q = normalize_query(prompt) if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")): return metrics_summary @@ -412,29 +430,75 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s worker_names = sets["worker_names"] worker_ready = sets["worker_ready"] worker_not_ready = sets["worker_not_ready"] + expected_workers = sets["expected_workers"] + expected_ready = sets["expected_ready"] + expected_not_ready = sets["expected_not_ready"] + expected_missing = sets["expected_missing"] total = len(names) + nodes_in_query = _extract_titan_nodes(q) + rpi_nodes = 
set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])) + non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", [])) + unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])) - for node in _extract_titan_nodes(q): - if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q): + if nodes_in_query and ("raspberry" in q or "rpi" in q): + parts: list[str] = [] + for node in nodes_in_query: + if node in rpi_nodes: + parts.append(f"{node} is a Raspberry Pi node.") + elif node in non_rpi: + parts.append(f"{node} is not a Raspberry Pi node.") + elif node in names: + parts.append(f"{node} is in Atlas but hardware is unknown.") + else: + parts.append(f"{node} is not in the Atlas cluster.") + return " ".join(parts) + + if nodes_in_query and "jetson" in q: + jets = set(groups.get("jetson", [])) + parts = [] + for node in nodes_in_query: + if node in jets: + parts.append(f"{node} is a Jetson node.") + elif node in names: + parts.append(f"{node} is not a Jetson node.") + else: + parts.append(f"{node} is not in the Atlas cluster.") + return " ".join(parts) + + if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q): + parts: list[str] = [] + for node in nodes_in_query: if node in names: - return f"Yes. {node} is in the Atlas cluster." - return f"No. {node} is not in the Atlas cluster." + parts.append(f"Yes. {node} is in the Atlas cluster.") + else: + parts.append(f"No. {node} is not in the Atlas cluster.") + return " ".join(parts) - if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: - non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) - if "besides" in q: - amd = groups.get("amd64", []) + if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")): + non_rpi_sorted = sorted(non_rpi) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes." + if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")): + amd = sorted(groups.get("amd64", [])) return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson." - return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found." + return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found." if "jetson" in q: jets = groups.get("jetson", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(jets)} Jetson nodes." return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found." if "amd64" in q or "x86" in q: amd = groups.get("amd64", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(amd)} amd64 nodes." return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found." + if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")): + count = sum(1 for node in inventory if node.get("arch") == "arm64") + return f"Atlas has {count} arm64 nodes." + if "rpi4" in q: rpi4 = groups.get("rpi4", []) if any(word in q for word in ("how many", "count", "number")): @@ -448,28 +512,52 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s return f"rpi5 nodes: {', '.join(rpi5)}." 
if rpi5 else "No rpi5 nodes found." if "raspberry" in q or "rpi" in q: - rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))) + rpi = sorted(rpi_nodes) if any(word in q for word in ("how many", "count", "number")): return f"Atlas has {len(rpi)} Raspberry Pi nodes." return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found." if "arm64-unknown" in q or "unknown" in q or "no hardware" in q: - unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))) + unknown = sorted(unknown_hw) return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels." - if "worker" in q and "node" in q: - if any(word in q for word in ("missing", "expected", "should")): - expected_workers = expected_worker_nodes_from_metrics() - missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else [] - if "missing" in q and missing: - return "Missing worker nodes: " + ", ".join(missing) + "." - if expected_workers: - msg = f"Grafana inventory expects {len(expected_workers)} workers." - if missing: - msg += f" Missing: {', '.join(missing)}." + if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q): + return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "." + + if "worker" in q and ("node" in q or "nodes" in q or "workers" in q): + not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q) + if expected_workers: + if "missing" in q: + return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "." + if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q): + return ( + f"Expected workers: {len(expected_ready)} ready, " + f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})." + ) + if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q): + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if expected_missing: + msg += f" Missing: {', '.join(expected_missing)}." return msg - return "No expected worker inventory found; using live cluster state." - if "not ready" in q or "unready" in q or "down" in q: + if not_ready_query: + if expected_not_ready or expected_missing: + detail = [] + if expected_not_ready: + detail.append(f"Not ready: {', '.join(expected_not_ready)}") + if expected_missing: + detail.append(f"Missing: {', '.join(expected_missing)}") + return "Worker nodes needing attention. " + " ".join(detail) + "." + return "All expected worker nodes are Ready." + if any(word in q for word in ("expected", "expect", "should")): + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if expected_missing: + msg += f" Missing: {', '.join(expected_missing)}." + return msg + if any(word in q for word in ("how many", "count", "number")): + return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})." + if "ready" in q: + return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}." + if not_ready_query: return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "." if any(word in q for word in ("how many", "count", "number")): return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready." 
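A note on PATCH 254: the normalize_query() helper is what keeps node-name
matching robust when chat clients substitute typographic dashes for ASCII
hyphens. A minimal standalone sketch of that behavior (the _DASH_CHARS
constant matches the patch; the example prompt is hypothetical):

    import re

    _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"

    def normalize_query(text: str) -> str:
        # Lowercase, map every Unicode dash variant to "-", collapse whitespace.
        cleaned = (text or "").lower()
        for ch in _DASH_CHARS:
            cleaned = cleaned.replace(ch, "-")
        return re.sub(r"\s+", " ", cleaned).strip()

    assert normalize_query("Is titan\u201305  Ready?") == "is titan-05 ready?"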
From b0abb9bd6e9f8f2c40db817f233e5f8a60f3d355 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 20:54:33 -0300 Subject: [PATCH 255/416] ariadne: reduce comms noise, fix gpu labels --- scripts/dashboards_render_atlas.py | 4 ++-- services/comms/mas-local-users-ensure-job.yaml | 2 +- services/comms/synapse-seeder-admin-ensure-job.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 2 +- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/dcgm-exporter.yaml | 2 ++ services/monitoring/grafana-dashboard-overview.yaml | 2 +- 7 files changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 11479d9..5aa77dc 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -364,9 +364,9 @@ ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' ARIADNE_TEST_SUCCESS_RATE = ( "100 * " - 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[1h])) ' + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) ' "/ clamp_min(" - 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[1h])), 1)' + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)' ) ARIADNE_TEST_FAILURES_24H = ( 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml index 5802009..c8cf5f0 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/mas-local-users-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: mas-local-users-ensure-15 + name: mas-local-users-ensure-16 namespace: comms spec: backoffLimit: 1 diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/synapse-seeder-admin-ensure-job.yaml index 9905658..ce8ccd3 100644 --- a/services/comms/synapse-seeder-admin-ensure-job.yaml +++ b/services/comms/synapse-seeder-admin-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-seeder-admin-ensure-7 + name: synapse-seeder-admin-ensure-8 namespace: comms spec: backoffLimit: 2 diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 0356e06..33b8a12 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -306,7 +306,7 @@ spec: - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME value: "*/5 * * * *" - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE - value: "*/30 * * * *" + value: "0 0 1 * *" - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM value: "0 0 1 1 *" - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 2d7f3e5..486cd61 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1690,7 +1690,7 @@ }, "targets": [ { - "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / 
clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "refId": "A" } ], diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index 8760c9f..3e8d1a6 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -50,6 +50,8 @@ spec: env: - name: DCGM_EXPORTER_KUBERNETES value: "true" + - name: KUBERNETES_VIRTUAL_GPUS + value: "true" securityContext: privileged: true resources: diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 5336134..afc1e1f 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1699,7 +1699,7 @@ data: }, "targets": [ { - "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "refId": "A" } ], From 879a75142956465014beea4630de3db6c71e1681 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Mon, 26 Jan 2026 23:54:53 +0000 Subject: [PATCH 256/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 3933caf..2678a46 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-49 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-50 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 72bd22e912437fa4d052658f93e969c43ba36934 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 20:57:47 -0300 Subject: [PATCH 257/416] monitoring: map dcgm to shared gpu resources --- services/monitoring/dcgm-exporter.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index 3e8d1a6..ff5aed5 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -52,6 +52,8 @@ spec: value: "true" - name: KUBERNETES_VIRTUAL_GPUS value: "true" + - name: NVIDIA_RESOURCE_NAMES + value: nvidia.com/gpu.shared securityContext: privileged: true resources: From 6432472be7724fbc95bf966f9a0d94a336ad055e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:13:04 -0300 Subject: [PATCH 258/416] atlasbot: answer hottest node queries via metrics --- services/comms/scripts/atlasbot/bot.py | 94 ++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6993db2..233b25e 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -18,6 +18,8 @@ OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") API_KEY = os.environ.get("CHAT_API_KEY", "") OLLAMA_TIMEOUT_SEC = 
float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) +ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090")) +ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "") KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") @@ -93,6 +95,12 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" +HOTTEST_QUERIES = { + "cpu": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "ram": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "net": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "io": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", +} def normalize_query(text: str) -> str: cleaned = (text or "").lower() @@ -291,6 +299,77 @@ def _extract_titan_nodes(text: str) -> list[str]: names.add(f"titan-{right.lower()}") return sorted(names) +def _humanize_rate(value: str, *, unit: str) -> str: + try: + val = float(value) + except (TypeError, ValueError): + return value + if unit == "%": + return f"{val:.1f}%" + if val >= 1024 * 1024: + return f"{val / (1024 * 1024):.2f} MB/s" + if val >= 1024: + return f"{val / 1024:.2f} KB/s" + return f"{val:.2f} B/s" + +def _hottest_query(metric: str, node_regex: str | None) -> str: + expr = HOTTEST_QUERIES[metric] + if node_regex: + needle = 'node_uname_info{nodename!=""}' + replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' + return expr.replace(needle, replacement) + return expr + +def _vm_hottest(metric: str, node_regex: str | None) -> tuple[str, str] | None: + expr = _hottest_query(metric, node_regex) + res = vm_query(expr) + series = _vm_value_series(res) + if not series: + return None + first = series[0] + labels = first.get("metric") or {} + value = first.get("value") or [] + val = value[1] if isinstance(value, list) and len(value) > 1 else "" + node = labels.get("node") or labels.get("__name__") or "" + if not node: + return None + return (str(node), str(val)) + +def _hottest_answer(q: str, *, nodes: list[str] | None) -> str: + metric = None + assumed_cpu = False + if "cpu" in q: + metric = "cpu" + elif "ram" in q or "memory" in q: + metric = "ram" + elif "net" in q or 
"network" in q: + metric = "net" + elif "io" in q or "disk" in q or "storage" in q: + metric = "io" + if metric is None: + metric = "cpu" + assumed_cpu = True + if nodes is not None and not nodes: + return "No nodes match the requested hardware class." + + node_regex = "|".join(nodes) if nodes else None + metrics = [metric] + lines: list[str] = [] + for m in metrics: + picked = _vm_hottest(m, node_regex) + if not picked: + continue + node, val = picked + unit = "%" if m in ("cpu", "ram") else "B/s" + val_str = _humanize_rate(val, unit=unit) + label = {"cpu": "CPU", "ram": "RAM", "net": "NET", "io": "I/O"}[m] + lines.append(f"{label}: {node} ({val_str})") + if not lines: + return "" + label = metric.upper() + suffix = " (defaulting to CPU)" if assumed_cpu else "" + return f"Hottest node by {label}: {lines[0].split(': ', 1)[1]}.{suffix}" + def _node_roles(labels: dict[str, Any]) -> list[str]: roles: list[str] = [] for key in labels.keys(): @@ -440,6 +519,21 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", [])) unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])) + if "hottest" in q or "hot" in q: + filter_nodes: list[str] | None = None + if "amd64" in q or "x86" in q: + filter_nodes = sorted(groups.get("amd64", [])) + elif "jetson" in q: + filter_nodes = sorted(groups.get("jetson", [])) + elif "raspberry" in q or "rpi" in q: + filter_nodes = sorted(rpi_nodes) + elif "arm64" in q: + filter_nodes = sorted([n for n in names if n not in groups.get("amd64", [])]) + hottest = _hottest_answer(q, nodes=filter_nodes) + if hottest: + return hottest + return "Unable to determine hottest nodes right now (metrics unavailable)." + if nodes_in_query and ("raspberry" in q or "rpi" in q): parts: list[str] = [] for node in nodes_in_query: From 8c90e0e5274dbdec6b2c9861e0cc3cd31cb3416d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:13:53 -0300 Subject: [PATCH 259/416] comms: restart atlasbot for hottest node fix --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7cc66b3..d5ad62e 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-22 + checksum/atlasbot-configmap: manual-atlasbot-23 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From ec834b7e0fc98dc790868c322e60acf0d789f109 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:26:13 -0300 Subject: [PATCH 260/416] vault: allow ariadne to use vault-admin role --- services/vault/scripts/vault_k8s_auth_configure.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index a956e0e..21132c7 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -193,8 +193,8 @@ path "kv/data/atlas/shared/*" { write_raw_policy "dev-kv" "${dev_kv_policy}" log "writing role vault-admin" vault_cmd write "auth/kubernetes/role/vault-admin" \ - bound_service_account_names="vault-admin" \ - 
bound_service_account_namespaces="vault" \ + bound_service_account_names="vault-admin,ariadne" \ + bound_service_account_namespaces="vault,maintenance" \ policies="vault-admin" \ ttl="${role_ttl}" From 1616994b19017c464ca75c2afc8d609948504ade Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:26:24 -0300 Subject: [PATCH 261/416] monitoring: unify jetson gpu metrics --- scripts/dashboards_render_atlas.py | 33 ++++++++++++++++++- services/monitoring/dashboards/atlas-gpu.json | 4 +-- .../monitoring/dashboards/atlas-overview.json | 2 +- .../monitoring/grafana-dashboard-gpu.yaml | 4 +-- .../grafana-dashboard-overview.yaml | 2 +- .../jetson-tegrastats-exporter.yaml | 4 +++ .../scripts/jetson_tegrastats_exporter.py | 4 ++- 7 files changed, 45 insertions(+), 8 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 5aa77dc..675fec5 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -208,7 +208,38 @@ def namespace_ram_raw(scope_var): def namespace_gpu_usage_instant(scope_var): - return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" + dcgm = f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" + jetson = jetson_gpu_usage_by_namespace(scope_var) + merged = ( + f'label_replace({dcgm}, "source", "dcgm", "", "") ' + f'or label_replace({jetson}, "source", "jetson", "", "")' + ) + return f"sum by (namespace) ({merged})" + + +def jetson_gpu_util_by_node(): + return 'max by (node) (jetson_gr3d_freq_percent{node!=""})' + + +def jetson_gpu_requests(scope_var): + return ( + "sum by (namespace,node) (" + f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' + "* on(namespace,pod) group_left(node) kube_pod_info " + '* on(node) group_left(label_jetson) kube_node_labels{label_jetson="true"}' + ")" + ) + + +def jetson_gpu_usage_by_namespace(scope_var): + requests_by_ns = jetson_gpu_requests(scope_var) + total_by_node = f"sum by (node) ({requests_by_ns})" + return ( + "sum by (namespace) (" + f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " + f"* on(node) group_left() {jetson_gpu_util_by_node()}" + ")" + ) def namespace_share_expr(resource_expr): diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index af8a1c5..6b76a5c 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", + "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 486cd61..04352f9 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * 
(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index d7950f2..46b25cd 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / 
clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", + "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) 
kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index afc1e1f..9495647 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by 
(node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 8788b20..a835401 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -44,6 +44,10 @@ spec: env: - name: JETSON_EXPORTER_PORT value: "9100" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumeMounts: - name: script mountPath: /etc/tegrastats-exporter diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index cd557e7..c4d3fa2 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -7,6 +7,7 @@ import threading from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) +NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, @@ -60,9 +61,10 @@ class Handler(http.server.BaseHTTPRequestHandler): with LOCK: metrics = METRICS.copy() out = [] + label = f'{{node="{NODE_NAME}"}}' for k, v in metrics.items(): out.append(f"# TYPE jetson_{k} gauge") - out.append(f"jetson_{k} {v}") + out.append(f"jetson_{k}{label} {v}") body = "\\n".join(out) + "\\n" self.send_response(200) self.send_header("Content-Type", "text/plain; version=0.0.4") From 6c413d4a5033b8cccda55186c14dca9d35bb0e41 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 01:27:02 +0000 Subject: [PATCH 262/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 2678a46..7528f6f 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-50 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-51 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 37a203509bdd85ab590882ab543cb7d68d04d7a1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:38:18 -0300 Subject: [PATCH 263/416] atlasbot: replace targeted handlers with generic planner --- services/comms/scripts/atlasbot/bot.py | 573 ++++++++++--------------- 1 file changed, 235 insertions(+), 338 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 233b25e..987df7a 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -95,11 +95,29 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" 
-HOTTEST_QUERIES = { - "cpu": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", - "ram": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", - "net": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", - "io": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + +OPERATION_HINTS = { + "count": ("how many", "count", "number", "total"), + "list": ("list", "which", "what are", "show", "names"), + "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum"), + "status": ("ready", "not ready", "unready", "down", "missing", "status"), +} + +METRIC_HINTS = { + "cpu": ("cpu",), + "ram": ("ram", "memory", "mem"), + "net": ("net", "network", "bandwidth", "throughput"), + "io": ("io", "disk", "storage"), + "connections": ("connections", "conn", "postgres", "database", "db"), +} + +HARDWARE_HINTS = { + "amd64": ("amd64", "x86", "x86_64", "x86-64"), + "jetson": ("jetson",), + "rpi4": ("rpi4",), + "rpi5": ("rpi5",), + "rpi": ("rpi", "raspberry"), + "arm64": ("arm64", "aarch64"), } def normalize_query(text: str) -> str: @@ -312,63 +330,127 @@ def _humanize_rate(value: str, *, unit: str) -> str: return f"{val / 1024:.2f} KB/s" return f"{val:.2f} B/s" -def _hottest_query(metric: str, node_regex: str | None) -> str: - expr = HOTTEST_QUERIES[metric] - if node_regex: - needle = 'node_uname_info{nodename!=""}' - replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' - return expr.replace(needle, replacement) - return expr +def _has_any(text: str, phrases: tuple[str, ...]) -> bool: + return any(p in text for p in phrases) -def _vm_hottest(metric: str, node_regex: str | None) -> tuple[str, str] | None: - expr = _hottest_query(metric, node_regex) - res = vm_query(expr) - series = _vm_value_series(res) - if not series: - return None - first = series[0] - labels = first.get("metric") or {} - value = first.get("value") or [] - val = value[1] if isinstance(value, list) and len(value) > 1 else "" - node = labels.get("node") or labels.get("__name__") or "" - if not node: - return None - return (str(node), str(val)) +def _detect_operation(q: str) -> str | None: + for op, phrases in OPERATION_HINTS.items(): + if _has_any(q, phrases): + return op + return None -def _hottest_answer(q: str, *, nodes: list[str] | None) -> str: - metric = None - assumed_cpu = False - if "cpu" in q: - metric = "cpu" - elif "ram" in q or "memory" in q: - metric = "ram" - elif "net" in q or "network" in q: - metric = "net" - elif "io" in q or "disk" in q or "storage" 
in q: - metric = "io" - if metric is None: - metric = "cpu" - assumed_cpu = True - if nodes is not None and not nodes: - return "No nodes match the requested hardware class." +def _detect_metric(q: str) -> str | None: + for metric, phrases in METRIC_HINTS.items(): + if _has_any(q, phrases): + return metric + return None - node_regex = "|".join(nodes) if nodes else None - metrics = [metric] - lines: list[str] = [] - for m in metrics: - picked = _vm_hottest(m, node_regex) - if not picked: +def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: + include: set[str] = set() + exclude: set[str] = set() + for hardware, phrases in HARDWARE_HINTS.items(): + for phrase in phrases: + if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q: + exclude.add(hardware) + elif phrase in q: + include.add(hardware) + return include, exclude + +def _detect_entity(q: str) -> str | None: + if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q): + return "node" + if "pod" in q or "pods" in q: + return "pod" + if "namespace" in q or "namespaces" in q: + return "namespace" + return None + +def _metric_entry_score(entry: dict[str, Any], tokens: list[str], *, metric: str | None, op: str | None) -> int: + hay = _metric_tokens(entry) + score = 0 + for t in set(tokens): + if t in hay: + score += 2 if t in (entry.get("panel_title") or "").lower() else 1 + if metric: + for phrase in METRIC_HINTS.get(metric, (metric,)): + if phrase in hay: + score += 3 + if op == "top" and ("hottest" in hay or "top" in hay): + score += 3 + if "node" in hay: + score += 1 + return score + +def _select_metric_entry(tokens: list[str], *, metric: str | None, op: str | None) -> dict[str, Any] | None: + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in _METRIC_INDEX: + if not isinstance(entry, dict): continue - node, val = picked - unit = "%" if m in ("cpu", "ram") else "B/s" - val_str = _humanize_rate(val, unit=unit) - label = {"cpu": "CPU", "ram": "RAM", "net": "NET", "io": "I/O"}[m] - lines.append(f"{label}: {node} ({val_str})") - if not lines: + score = _metric_entry_score(entry, tokens, metric=metric, op=op) + if score: + scored.append((score, entry)) + if not scored: + return None + scored.sort(key=lambda item: item[0], reverse=True) + return scored[0][1] + +def _apply_node_filter(expr: str, node_regex: str | None) -> str: + if not node_regex: + return expr + needle = 'node_uname_info{nodename!=""}' + replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' + return expr.replace(needle, replacement) + +def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str: + series = _vm_value_series(res) + panel = entry.get("panel_title") or "Metric" + if not series: return "" - label = metric.upper() - suffix = " (defaulting to CPU)" if assumed_cpu else "" - return f"Hottest node by {label}: {lines[0].split(': ', 1)[1]}.{suffix}" + rendered = vm_render_result(res, limit=5) + if not rendered: + return "" + lines = [line.lstrip("-").strip() for line in rendered.splitlines() if line.strip().startswith("-")] + if len(lines) == 1: + return f"{panel}: {lines[0]}." 
+ return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines) + +def _inventory_filter( + inventory: list[dict[str, Any]], + *, + include_hw: set[str], + exclude_hw: set[str], + only_workers: bool, + only_ready: bool | None, + nodes_in_query: list[str], +) -> list[dict[str, Any]]: + results = inventory + if nodes_in_query: + results = [node for node in results if node.get("name") in nodes_in_query] + if only_workers: + results = [node for node in results if node.get("is_worker") is True] + if only_ready is True: + results = [node for node in results if node.get("ready") is True] + if only_ready is False: + results = [node for node in results if node.get("ready") is False] + if include_hw: + results = [node for node in results if _hardware_match(node, include_hw)] + if exclude_hw: + results = [node for node in results if not _hardware_match(node, exclude_hw)] + return results + +def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool: + hw = node.get("hardware") or "" + arch = node.get("arch") or "" + for f in filters: + if f == "rpi" and hw in ("rpi4", "rpi5"): + return True + if f == "arm64" and arch == "arm64": + return True + if hw == f: + return True + if f == "amd64" and arch == "amd64": + return True + return False def _node_roles(labels: dict[str, Any]) -> list[str]: roles: list[str] = [] @@ -495,176 +577,103 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: q = normalize_query(prompt) - if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")): - return metrics_summary - - if not inventory: + if not q: return "" - sets = _inventory_sets(inventory) - names = sets["names"] - ready = sets["ready"] - not_ready = sets["not_ready"] - groups = sets["groups"] - worker_names = sets["worker_names"] - worker_ready = sets["worker_ready"] - worker_not_ready = sets["worker_not_ready"] - expected_workers = sets["expected_workers"] - expected_ready = sets["expected_ready"] - expected_not_ready = sets["expected_not_ready"] - expected_missing = sets["expected_missing"] - total = len(names) + tokens = _tokens(q) + op = _detect_operation(q) + metric = _detect_metric(q) + entity = _detect_entity(q) + include_hw, exclude_hw = _detect_hardware_filters(q) nodes_in_query = _extract_titan_nodes(q) - rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])) - non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", [])) - unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])) + only_workers = "worker" in q or "workers" in q + only_ready: bool | None = None + if "not ready" in q or "unready" in q or "down" in q or "missing" in q: + only_ready = False + elif "ready" in q: + only_ready = True - if "hottest" in q or "hot" in q: - filter_nodes: list[str] | None = None - if "amd64" in q or "x86" in q: - filter_nodes = sorted(groups.get("amd64", [])) - elif "jetson" in q: - filter_nodes = sorted(groups.get("jetson", [])) - elif "raspberry" in q or "rpi" in q: - filter_nodes = sorted(rpi_nodes) - elif "arm64" in q: - filter_nodes = sorted([n for n in names if n not in groups.get("amd64", [])]) - hottest = _hottest_answer(q, nodes=filter_nodes) - if hottest: - return hottest - return "Unable to determine hottest nodes right now (metrics unavailable)." 
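# Hypothetical sketch, not part of this patch: the planner above collapses a
# free-form question into (operation, metric, hardware filters) by scanning for
# hint phrases, roughly like this:
#
#     HINTS = {"top": ("hottest", "highest"), "count": ("how many", "count")}
#
#     def detect(q: str, hints: dict[str, tuple[str, ...]]) -> str | None:
#         for key, phrases in hints.items():
#             if any(phrase in q for phrase in phrases):
#                 return key
#         return None
#
#     detect("which node is hottest", HINTS)  # -> "top"
#
# The real implementation iterates OPERATION_HINTS, METRIC_HINTS, and
# HARDWARE_HINTS in that spirit before choosing a metric query or inventory path.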
+ if entity == "node" and only_ready is not None and op != "count": + op = "status" - if nodes_in_query and ("raspberry" in q or "rpi" in q): - parts: list[str] = [] - for node in nodes_in_query: - if node in rpi_nodes: - parts.append(f"{node} is a Raspberry Pi node.") - elif node in non_rpi: - parts.append(f"{node} is not a Raspberry Pi node.") - elif node in names: - parts.append(f"{node} is in Atlas but hardware is unknown.") - else: - parts.append(f"{node} is not in the Atlas cluster.") - return " ".join(parts) + if not op and entity == "node": + op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" - if nodes_in_query and "jetson" in q: - jets = set(groups.get("jetson", [])) - parts = [] - for node in nodes_in_query: - if node in jets: - parts.append(f"{node} is a Jetson node.") - elif node in names: - parts.append(f"{node} is not a Jetson node.") - else: - parts.append(f"{node} is not in the Atlas cluster.") - return " ".join(parts) + if op == "top" and metric is None: + metric = "cpu" - if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q): - parts: list[str] = [] - for node in nodes_in_query: - if node in names: - parts.append(f"Yes. {node} is in the Atlas cluster.") - else: - parts.append(f"No. {node} is not in the Atlas cluster.") - return " ".join(parts) - - if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")): - non_rpi_sorted = sorted(non_rpi) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes." - if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")): - amd = sorted(groups.get("amd64", [])) - return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson." - return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found." - - if "jetson" in q: - jets = groups.get("jetson", []) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(jets)} Jetson nodes." - return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found." - - if "amd64" in q or "x86" in q: - amd = groups.get("amd64", []) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(amd)} amd64 nodes." - return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found." - - if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")): - count = sum(1 for node in inventory if node.get("arch") == "arm64") - return f"Atlas has {count} arm64 nodes." - - if "rpi4" in q: - rpi4 = groups.get("rpi4", []) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(rpi4)} rpi4 nodes." - return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found." - - if "rpi5" in q: - rpi5 = groups.get("rpi5", []) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(rpi5)} rpi5 nodes." - return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found." - - if "raspberry" in q or "rpi" in q: - rpi = sorted(rpi_nodes) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(rpi)} Raspberry Pi nodes." - return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found." 
- - if "arm64-unknown" in q or "unknown" in q or "no hardware" in q: - unknown = sorted(unknown_hw) - return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels." - - if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q): - return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "." - - if "worker" in q and ("node" in q or "nodes" in q or "workers" in q): - not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q) - if expected_workers: - if "missing" in q: - return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "." - if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q): - return ( - f"Expected workers: {len(expected_ready)} ready, " - f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})." + # Metrics-first when a metric or top operation is requested. + if metric or op == "top": + entry = _select_metric_entry(tokens, metric=metric, op=op) + if entry and isinstance(entry.get("exprs"), list) and entry["exprs"]: + expr = entry["exprs"][0] + if inventory: + scoped = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=None, + nodes_in_query=nodes_in_query, ) - if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q): - msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." - if expected_missing: - msg += f" Missing: {', '.join(expected_missing)}." - return msg - if not_ready_query: - if expected_not_ready or expected_missing: - detail = [] - if expected_not_ready: - detail.append(f"Not ready: {', '.join(expected_not_ready)}") - if expected_missing: - detail.append(f"Missing: {', '.join(expected_missing)}") - return "Worker nodes needing attention. " + " ".join(detail) + "." - return "All expected worker nodes are Ready." - if any(word in q for word in ("expected", "expect", "should")): - msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." - if expected_missing: - msg += f" Missing: {', '.join(expected_missing)}." - return msg - if any(word in q for word in ("how many", "count", "number")): - return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})." - if "ready" in q: - return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}." - if not_ready_query: - return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "." - if any(word in q for word in ("how many", "count", "number")): - return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready." - return "Ready worker nodes ({}): {}.".format(len(worker_ready), ", ".join(worker_ready)) + if scoped: + node_regex = "|".join([n["name"] for n in scoped]) + expr = _apply_node_filter(expr, node_regex) + res = vm_query(expr, timeout=20) + answer = _format_metric_answer(entry, res) + if answer: + return answer + if metrics_summary: + return metrics_summary - if any(word in q for word in ("how many", "count", "number")) and "node" in q: - return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready." + if entity != "node" or not inventory: + if any(word in q for word in METRIC_HINT_WORDS) and not metrics_summary: + return "I don't have data to answer that right now." 
+ return "" - if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q: - return "Atlas node names: " + ", ".join(names) + "." + expected_workers = expected_worker_nodes_from_metrics() + filtered = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=only_ready if op in ("status", "count") else None, + nodes_in_query=nodes_in_query, + ) + names = [node["name"] for node in filtered] - if "ready" in q and "node" in q: - return f"Ready nodes ({len(ready)}): {', '.join(ready)}." + if op == "status": + if "missing" in q and expected_workers: + missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) + return "Missing nodes: " + (", ".join(missing) if missing else "none") + "." + if only_ready is False: + return "Not ready nodes: " + (", ".join(names) if names else "none") + "." + if only_ready is True: + return f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + "." + + if op == "count": + if expected_workers and ("expected" in q or "should" in q): + missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if missing: + msg += f" Missing: {', '.join(missing)}." + return msg + if not (include_hw or exclude_hw or nodes_in_query or only_workers): + return f"Atlas has {len(names)} nodes." + return f"Matching nodes: {len(names)}." + + if op == "list": + if nodes_in_query: + parts = [] + existing = {n["name"] for n in inventory} + for node in nodes_in_query: + parts.append(f"{node}: {'present' if node in existing else 'not present'}") + return "Node presence: " + ", ".join(parts) + "." + if not names: + return "Matching nodes: none." + shown = names[:30] + suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else "" + return "Matching nodes: " + ", ".join(shown) + suffix + "." return "" @@ -727,25 +736,6 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: fallback = _metrics_fallback_summary(panel, summary) return context, fallback -def jetson_nodes_from_kb() -> list[str]: - for doc in KB.get("runbooks", []): - if not isinstance(doc, dict): - continue - body = str(doc.get("body") or "") - for line in body.splitlines(): - if "jetson" not in line.lower(): - continue - names = _extract_titan_nodes(line) - if names: - return names - return [] - -def jetson_nodes_summary(cluster_name: str) -> str: - names = jetson_nodes_from_kb() - if names: - return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}." - return "" - def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]: q = (query or "").strip() if not q or not KB.get("catalog"): @@ -953,22 +943,16 @@ def _parse_metric_lines(summary: str) -> dict[str, str]: def _metrics_fallback_summary(panel: str, summary: str) -> str: parsed = _parse_metric_lines(summary) panel_l = (panel or "").lower() - if panel_l.startswith("postgres connections"): - used = parsed.get("conn=used") - maxv = parsed.get("conn=max") - if used and maxv: - try: - used_i = int(float(used)) - max_i = int(float(maxv)) - except ValueError: - return f"Postgres connections: {summary}" - free = max_i - used_i - return f"Postgres connections: {used_i}/{max_i} used ({free} free)." - if panel_l.startswith("postgres hottest"): - if parsed: - label, value = next(iter(parsed.items())) - return f"Most Postgres connections: {label} = {value}." 
- return f"{panel}: {summary}" + if parsed: + items = list(parsed.items()) + if len(items) == 1: + label, value = items[0] + return f"{panel}: {label} = {value}." + compact = "; ".join(f"{k}={v}" for k, v in items) + return f"{panel}: {compact}." + if panel_l: + return f"{panel}: {summary}" + return summary def _node_ready_status(node: dict) -> bool | None: conditions = node.get("status", {}).get("conditions") or [] @@ -1075,93 +1059,6 @@ def vm_cluster_snapshot() -> str: parts.append(pr) return "\n".join(parts).strip() -def nodes_summary(cluster_name: str) -> str: - state = _ariadne_state() - if state: - nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {} - total = nodes.get("total") - ready = nodes.get("ready") - not_ready = nodes.get("not_ready") - if isinstance(total, int) and isinstance(ready, int): - not_ready = not_ready if isinstance(not_ready, int) else max(total - ready, 0) - if not_ready: - return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." - return f"{cluster_name} cluster has {total} nodes, all Ready." - try: - data = k8s_get("/api/v1/nodes?limit=500") - except Exception: - return "" - items = data.get("items") or [] - if not isinstance(items, list) or not items: - return "" - total = len(items) - ready = 0 - for node in items: - conditions = node.get("status", {}).get("conditions") or [] - for cond in conditions if isinstance(conditions, list) else []: - if cond.get("type") == "Ready": - if cond.get("status") == "True": - ready += 1 - break - not_ready = max(total - ready, 0) - if not_ready: - return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." - return f"{cluster_name} cluster has {total} nodes, all Ready." - -def nodes_names_summary(cluster_name: str) -> str: - state = _ariadne_state() - if state: - nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {} - names = nodes.get("names") - if isinstance(names, list) and names: - cleaned = sorted({str(n) for n in names if n}) - if len(cleaned) <= 30: - return f"{cluster_name} node names: {', '.join(cleaned)}." - shown = ", ".join(cleaned[:30]) - return f"{cluster_name} node names: {shown}, … (+{len(cleaned) - 30} more)." - try: - data = k8s_get("/api/v1/nodes?limit=500") - except Exception: - return "" - items = data.get("items") or [] - if not isinstance(items, list) or not items: - return "" - names = [] - for node in items: - name = (node.get("metadata") or {}).get("name") or "" - if name: - names.append(name) - names = sorted(set(names)) - if not names: - return "" - if len(names) <= 30: - return f"{cluster_name} node names: {', '.join(names)}." - shown = ", ".join(names[:30]) - return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)." - - -def nodes_arch_summary(cluster_name: str, arch: str) -> str: - try: - data = k8s_get("/api/v1/nodes?limit=500") - except Exception: - return "" - items = data.get("items") or [] - if not isinstance(items, list) or not items: - return "" - normalized = (arch or "").strip().lower() - if normalized in ("aarch64", "arm64"): - arch_label = "arm64" - elif normalized in ("x86_64", "x86-64", "amd64"): - arch_label = "amd64" - else: - arch_label = normalized - total = 0 - for node in items: - labels = (node.get("metadata") or {}).get("labels") or {} - if labels.get("kubernetes.io/arch") == arch_label: - total += 1 - return f"{cluster_name} cluster has {total} {arch_label} nodes." 
- def _strip_code_fence(text: str) -> str: cleaned = (text or "").strip() match = CODE_FENCE_RE.match(cleaned) From 689bf10995d3ffe025a07aeda285c30f86e754b1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:39:01 -0300 Subject: [PATCH 264/416] comms: restart atlasbot for generic planner --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d5ad62e..d195e89 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-23 + checksum/atlasbot-configmap: manual-atlasbot-24 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From c8662a624e280b874d243a7f16ba376b0da36774 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:43:58 -0300 Subject: [PATCH 265/416] atlasbot: add internal endpoint and portal wiring --- .../bstein-dev-home/backend-deployment.yaml | 5 ++ services/comms/atlasbot-deployment.yaml | 7 ++- services/comms/atlasbot-service.yaml | 15 +++++ services/comms/kustomization.yaml | 1 + services/comms/scripts/atlasbot/bot.py | 58 +++++++++++++++++++ 5 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 services/comms/atlasbot-service.yaml diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index ecf478c..26c99e1 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -28,6 +28,7 @@ spec: {{ with secret "kv/data/atlas/shared/chat-ai-keys-runtime" }} export CHAT_KEY_MATRIX="{{ .Data.data.matrix }}" export CHAT_KEY_HOMEPAGE="{{ .Data.data.homepage }}" + export AI_ATLASBOT_TOKEN="{{ .Data.data.homepage }}" {{ end }} {{ with secret "kv/data/atlas/shared/portal-e2e-client" }} export PORTAL_E2E_CLIENT_ID="{{ .Data.data.client_id }}" @@ -66,6 +67,10 @@ spec: value: qwen2.5-coder:7b-instruct-q4_0 - name: AI_CHAT_TIMEOUT_SEC value: "480" + - name: AI_ATLASBOT_ENDPOINT + value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer + - name: AI_ATLASBOT_TIMEOUT_SEC + value: "5" - name: AI_NODE_NAME valueFrom: fieldRef: diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d195e89..c0596b6 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-24 + checksum/atlasbot-configmap: manual-atlasbot-25 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -87,6 +87,11 @@ spec: value: "480" - name: ATLASBOT_THINKING_INTERVAL_SEC value: "120" + - name: ATLASBOT_HTTP_PORT + value: "8090" + ports: + - name: http + containerPort: 8090 resources: requests: cpu: 100m diff --git a/services/comms/atlasbot-service.yaml b/services/comms/atlasbot-service.yaml new file mode 100644 index 0000000..c8b3570 --- /dev/null +++ b/services/comms/atlasbot-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: atlasbot + namespace: comms + labels: + app: atlasbot +spec: + selector: + app: atlasbot + ports: + - name: http + port: 
8090 + targetPort: 8090 + type: ClusterIP diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 37f681d..410f2a6 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -14,6 +14,7 @@ resources: - guest-register-deployment.yaml - guest-register-service.yaml - atlasbot-deployment.yaml + - atlasbot-service.yaml - wellknown.yaml - atlasbot-rbac.yaml - mas-secrets-ensure-rbac.yaml diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 987df7a..deb8e62 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -5,6 +5,7 @@ import re import ssl import threading import time +from http.server import BaseHTTPRequestHandler, HTTPServer from typing import Any from urllib import error, parse, request @@ -1089,6 +1090,62 @@ def _normalize_reply(value: Any) -> str: return text +# Internal HTTP endpoint for cluster answers (website uses this). +class _AtlasbotHandler(BaseHTTPRequestHandler): + server_version = "AtlasbotHTTP/1.0" + + def _write_json(self, status: int, payload: dict[str, Any]): + body = json.dumps(payload).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _authorized(self) -> bool: + if not ATLASBOT_INTERNAL_TOKEN: + return True + token = self.headers.get("X-Internal-Token", "") + return token == ATLASBOT_INTERNAL_TOKEN + + def do_GET(self): # noqa: N802 + if self.path == "/health": + self._write_json(200, {"status": "ok"}) + return + self._write_json(404, {"error": "not_found"}) + + def do_POST(self): # noqa: N802 + if self.path != "/v1/answer": + self._write_json(404, {"error": "not_found"}) + return + if not self._authorized(): + self._write_json(401, {"error": "unauthorized"}) + return + try: + length = int(self.headers.get("Content-Length", "0")) + except ValueError: + length = 0 + raw = self.rfile.read(length) if length > 0 else b"" + try: + payload = json.loads(raw.decode("utf-8")) if raw else {} + except json.JSONDecodeError: + self._write_json(400, {"error": "invalid_json"}) + return + prompt = str(payload.get("prompt") or payload.get("question") or "").strip() + if not prompt: + self._write_json(400, {"error": "missing_prompt"}) + return + inventory = node_inventory_live() + answer = structured_answer(prompt, inventory=inventory, metrics_summary="") + self._write_json(200, {"answer": answer}) + + +def _start_http_server(): + server = HTTPServer(("0.0.0.0", ATLASBOT_HTTP_PORT), _AtlasbotHandler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + + # Conversation state. 
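The service and handler above give the portal backend a cluster-local answer path: POST /v1/answer on port 8090, guarded by an X-Internal-Token header (the check is skipped when no token is configured). A minimal client sketch, assuming the in-cluster URL and the 5-second timeout wired into the backend env above; ask_atlasbot is a hypothetical helper, not part of the tree:

    import json
    from urllib import request

    def ask_atlasbot(prompt: str, token: str,
                     url: str = "http://atlasbot.comms.svc.cluster.local:8090/v1/answer",
                     timeout: float = 5.0) -> str:
        # Mirrors the handler contract: JSON body with "prompt", auth via
        # the X-Internal-Token header, and a JSON {"answer": ...} response.
        body = json.dumps({"prompt": prompt}).encode("utf-8")
        req = request.Request(url, data=body, method="POST")
        req.add_header("Content-Type", "application/json")
        req.add_header("X-Internal-Token", token)
        with request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8")).get("answer", "")

Worth noting: http.server.HTTPServer is single-threaded, so one slow structured answer blocks concurrent callers; the short client-side timeout bounds the damage on the portal side.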
history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript) @@ -1326,6 +1383,7 @@ def login_with_retry(): def main(): load_kb() + _start_http_server() token = login_with_retry() try: room_id = resolve_alias(token, ROOM_ALIAS) From 328241b7ac136fc1bb92e0d26a235f12516fadae Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 01:47:43 +0000 Subject: [PATCH 266/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 90c3b8d..fe604b6 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From f08d740d834155f298a9c126865349831103ace5 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 01:47:47 +0000 Subject: [PATCH 267/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index fe604b6..f50c38b 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 0331e7ea99862e5db24569236bb94b3f514ad0f4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:50:12 -0300 Subject: [PATCH 268/416] monitoring: fix jetson metrics newlines --- services/monitoring/scripts/jetson_tegrastats_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index c4d3fa2..c237ec5 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -65,7 +65,7 @@ class Handler(http.server.BaseHTTPRequestHandler): for k, v in metrics.items(): out.append(f"# TYPE jetson_{k} gauge") out.append(f"jetson_{k}{label} {v}") - body = "\\n".join(out) + "\\n" + body = "\n".join(out) + "\n" self.send_response(200) self.send_header("Content-Type", "text/plain; version=0.0.4") self.send_header("Content-Length", str(len(body))) From 270dc939667cc7853bf2fbe9750a02fe758a9661 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:51:04 -0300 Subject: [PATCH 269/416] atlasbot: prioritize top queries over list --- services/comms/scripts/atlasbot/bot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py 
b/services/comms/scripts/atlasbot/bot.py index deb8e62..e6c7542 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -335,7 +335,11 @@ def _has_any(text: str, phrases: tuple[str, ...]) -> bool: return any(p in text for p in phrases) def _detect_operation(q: str) -> str | None: + if _has_any(q, OPERATION_HINTS["top"]): + return "top" for op, phrases in OPERATION_HINTS.items(): + if op == "top": + continue if _has_any(q, phrases): return op return None From 9ea338b1219088a84d273fa1e236395a57cd5048 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:51:41 -0300 Subject: [PATCH 270/416] monitoring: restart jetson exporter --- services/monitoring/jetson-tegrastats-exporter.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index a835401..8584eba 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,6 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" + monitoring.bstein.dev/restart-rev: "1" spec: serviceAccountName: default hostPID: true From 66ce0caaf426e71efcc27225cca7349733b27929 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:52:49 -0300 Subject: [PATCH 271/416] comms: restart atlasbot for op priority --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c0596b6..3ebb861 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-25 + checksum/atlasbot-configmap: manual-atlasbot-26 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From adc711be62f0ed0233f62be8b72ba5caa7d9fc1b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:54:43 -0300 Subject: [PATCH 272/416] comms: rerun synapse user seed --- services/comms/synapse-user-seed-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/synapse-user-seed-job.yaml b/services/comms/synapse-user-seed-job.yaml index 7fef796..aab88c3 100644 --- a/services/comms/synapse-user-seed-job.yaml +++ b/services/comms/synapse-user-seed-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-user-seed-7 + name: synapse-user-seed-8 namespace: comms spec: backoffLimit: 1 From d325111f34e9d310ad5b332134df3a9a20193aee Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 02:52:49 +0000 Subject: [PATCH 273/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f50c38b..d6208c4 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: 
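The _detect_operation change above exists because a phrase like "top nodes by cpu" also matches broader hints, and dict iteration order decided the winner. Checking the most specific intent first makes the priority explicit. A self-contained sketch with made-up hint phrases (the real OPERATION_HINTS table lives elsewhere in bot.py):

    OPERATION_HINTS = {
        "top": ("top", "hottest", "highest"),
        "count": ("how many", "count"),
        "list": ("list", "show", "which nodes"),
    }
    PRIORITY = ("top", "count", "list")  # most specific intent first

    def detect_operation(q: str) -> str | None:
        for op in PRIORITY:
            if any(phrase in q for phrase in OPERATION_HINTS[op]):
                return op
        return None

    assert detect_operation("show top nodes by cpu") == "top"  # "show" alone would pick "list"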
registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From d9c8632b8d742e762bd97148de2a30f3d1c9cb1e Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 02:53:50 +0000 Subject: [PATCH 274/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d6208c4..a520991 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 9ecdf054d3bcc08d0b6c66fba7a70c7f526e7c44 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 01:04:38 -0300 Subject: [PATCH 275/416] vault: bootstrap k8s auth config with root token --- services/vault/k8s-auth-config-cronjob.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml index 43da16b..5a2d682 100644 --- a/services/vault/k8s-auth-config-cronjob.yaml +++ b/services/vault/k8s-auth-config-cronjob.yaml @@ -34,6 +34,11 @@ spec: value: http://10.43.57.249:8200 - name: VAULT_K8S_ROLE value: vault-admin + - name: VAULT_TOKEN + valueFrom: + secretKeyRef: + name: vault-init + key: root_token - name: VAULT_K8S_TOKEN_REVIEWER_JWT_FILE value: /var/run/secrets/vault-token-reviewer/token - name: VAULT_K8S_ROLE_TTL From e24ff4782c5fbd653afbd4ade9c2668c0951ca30 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 01:14:42 -0300 Subject: [PATCH 276/416] comms: rerun ensure jobs and fix vault oidc env --- services/comms/comms-secrets-ensure-job.yaml | 2 +- services/comms/mas-local-users-ensure-job.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/services/comms/comms-secrets-ensure-job.yaml b/services/comms/comms-secrets-ensure-job.yaml index b71dd40..52904cc 100644 --- a/services/comms/comms-secrets-ensure-job.yaml +++ b/services/comms/comms-secrets-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: comms-secrets-ensure-6 + name: comms-secrets-ensure-7 namespace: comms spec: backoffLimit: 1 diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml index c8cf5f0..d385b47 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/mas-local-users-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: mas-local-users-ensure-16 + name: mas-local-users-ensure-17 namespace: comms spec: backoffLimit: 1 diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 33b8a12..6fa638d 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -89,7 +89,11 @@ spec: export VAULT_OIDC_USER_POLICIES="{{ .Data.data.user_policies }}" export 
VAULT_OIDC_REDIRECT_URIS="{{ .Data.data.redirect_uris }}" export VAULT_OIDC_BOUND_AUDIENCES="{{ .Data.data.bound_audiences }}" + {{- if .Data.data.bound_claims_type }} export VAULT_OIDC_BOUND_CLAIMS_TYPE="{{ .Data.data.bound_claims_type }}" + {{- else }} + export VAULT_OIDC_BOUND_CLAIMS_TYPE="string" + {{- end }} {{ end }} spec: serviceAccountName: ariadne From bab914c58f6508f985f45cbfed1292c76be70eb8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 01:19:43 -0300 Subject: [PATCH 277/416] comms: rerun mas local user ensure --- services/comms/mas-local-users-ensure-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml index d385b47..636ee5b 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/mas-local-users-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: mas-local-users-ensure-17 + name: mas-local-users-ensure-18 namespace: comms spec: backoffLimit: 1 From f7fc152439169b73978d8c67bd0bd06335dab0a9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 01:22:02 -0300 Subject: [PATCH 278/416] comms: rerun synapse seeder admin ensure --- services/comms/synapse-seeder-admin-ensure-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/synapse-seeder-admin-ensure-job.yaml index ce8ccd3..5d2d422 100644 --- a/services/comms/synapse-seeder-admin-ensure-job.yaml +++ b/services/comms/synapse-seeder-admin-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-seeder-admin-ensure-8 + name: synapse-seeder-admin-ensure-9 namespace: comms spec: backoffLimit: 2 From 5e4a9747333fcdf33f7d2ffd536666da73d18110 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 06:51:28 +0000 Subject: [PATCH 279/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 7528f6f..c8f9f2c 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-51 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-54 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 600c124ef29ed45a3990dfaf386abb8827048b0d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 03:56:00 -0300 Subject: [PATCH 280/416] atlasbot: clarify scoped metrics and format percent values --- services/comms/scripts/atlasbot/bot.py | 57 ++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e6c7542..f8b3ccf 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -406,15 +406,56 @@ def _apply_node_filter(expr: str, node_regex: str | None) -> str: replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' return expr.replace(needle, replacement) +def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool: + exprs = entry.get("exprs") + expr = exprs[0] if isinstance(exprs, list) and exprs else "" + return "* 100" in expr 
or "*100" in expr + + +def _format_metric_value(value: str, *, percent: bool) -> str: + try: + num = float(value) + except (TypeError, ValueError): + return value + if percent: + return f"{num:.1f}%" + if abs(num) >= 1: + return f"{num:.2f}".rstrip("0").rstrip(".") + return f"{num:.4f}".rstrip("0").rstrip(".") + + +def _format_metric_label(metric: dict[str, Any]) -> str: + label_parts = [] + for k in ("namespace", "pod", "container", "node", "instance", "job", "phase"): + if metric.get(k): + label_parts.append(f"{k}={metric.get(k)}") + if not label_parts: + for k in sorted(metric.keys()): + if k.startswith("__"): + continue + label_parts.append(f"{k}={metric.get(k)}") + if len(label_parts) >= 4: + break + return ", ".join(label_parts) if label_parts else "series" + + def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str: series = _vm_value_series(res) panel = entry.get("panel_title") or "Metric" if not series: return "" - rendered = vm_render_result(res, limit=5) - if not rendered: + percent = _metric_expr_uses_percent(entry) + lines: list[str] = [] + for r in series[:5]: + if not isinstance(r, dict): + continue + metric = r.get("metric") or {} + value = r.get("value") or [] + val = value[1] if isinstance(value, list) and len(value) > 1 else "" + label = _format_metric_label(metric if isinstance(metric, dict) else {}) + lines.append(f"{label}: {_format_metric_value(val, percent=percent)}") + if not lines: return "" - lines = [line.lstrip("-").strip() for line in rendered.splitlines() if line.strip().startswith("-")] if len(lines) == 1: return f"{panel}: {lines[0]}." return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines) @@ -627,6 +668,16 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s res = vm_query(expr, timeout=20) answer = _format_metric_answer(entry, res) if answer: + scope_parts: list[str] = [] + if include_hw: + scope_parts.append(" and ".join(sorted(include_hw))) + if exclude_hw: + scope_parts.append(f"excluding {' and '.join(sorted(exclude_hw))}") + if only_workers: + scope_parts.append("worker") + if scope_parts: + scope = " ".join(scope_parts) + return f"Among {scope} nodes, {answer}" return answer if metrics_summary: return metrics_summary From 39fd7adb5550365261dde70d1e7b0b46514f036a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 03:56:47 -0300 Subject: [PATCH 281/416] comms: restart atlasbot for metrics formatting --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 3ebb861..83e0b2e 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-26 + checksum/atlasbot-configmap: manual-atlasbot-27 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 0ef14c67fd4b92ecc51d35a5a9f0dee1a61ba593 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:48:44 -0300 Subject: [PATCH 282/416] comms: add synapse admin ensure job --- services/comms/kustomization.yaml | 1 + services/comms/synapse-admin-ensure-job.yaml | 177 ++++++++++++++++++ services/maintenance/ariadne-deployment.yaml | 3 + .../vault/scripts/vault_k8s_auth_configure.sh | 4 +- 4 files changed, 183 insertions(+), 2 deletions(-) 
create mode 100644 services/comms/synapse-admin-ensure-job.yaml diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 410f2a6..01d7be5 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -25,6 +25,7 @@ resources: - mas-admin-client-secret-ensure-job.yaml - mas-db-ensure-job.yaml - comms-secrets-ensure-job.yaml + - synapse-admin-ensure-job.yaml - synapse-signingkey-ensure-job.yaml - synapse-seeder-admin-ensure-job.yaml - synapse-user-seed-job.yaml diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/synapse-admin-ensure-job.yaml new file mode 100644 index 0000000..be9e0fd --- /dev/null +++ b/services/comms/synapse-admin-ensure-job.yaml @@ -0,0 +1,177 @@ +# services/comms/synapse-admin-ensure-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: synapse-admin-ensure-1 + namespace: comms +spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + spec: + serviceAccountName: comms-secrets-ensure + restartPolicy: OnFailure + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] + containers: + - name: ensure + image: python:3.11-slim + env: + - name: VAULT_ADDR + value: http://vault.vault.svc.cluster.local:8200 + - name: VAULT_ROLE + value: comms-secrets + - name: SYNAPSE_ADMIN_URL + value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008 + command: + - /bin/sh + - -c + - | + set -euo pipefail + python - <<'PY' + import base64 + import hashlib + import hmac + import json + import os + import secrets + import string + import urllib.error + import urllib.request + + VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/") + VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets") + SYNAPSE_ADMIN_URL = os.environ.get( + "SYNAPSE_ADMIN_URL", + "http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008", + ).rstrip("/") + SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" + + def log(msg: str) -> None: + print(msg, flush=True) + + def request_json(url: str, payload: dict | None = None) -> dict: + data = None + headers = {"Content-Type": "application/json"} + if payload is not None: + data = json.dumps(payload).encode("utf-8") + req = urllib.request.Request(url, data=data, headers=headers, method="POST" if data else "GET") + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read().decode("utf-8")) + + def vault_login() -> str: + with open(SA_TOKEN_PATH, "r", encoding="utf-8") as f: + jwt = f.read().strip() + payload = {"jwt": jwt, "role": VAULT_ROLE} + resp = request_json(f"{VAULT_ADDR}/v1/auth/kubernetes/login", payload) + token = resp.get("auth", {}).get("client_token") + if not token: + raise RuntimeError("vault login failed") + return token + + def vault_get(token: str, path: str) -> dict: + req = urllib.request.Request( + f"{VAULT_ADDR}/v1/kv/data/atlas/{path}", + headers={"X-Vault-Token": token}, + ) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + payload = json.loads(resp.read().decode("utf-8")) + return payload.get("data", {}).get("data", {}) + except urllib.error.HTTPError as exc: + if exc.code == 404: + return {} + raise + + def vault_put(token: str, path: str, 
data: dict) -> None: + payload = {"data": data} + req = urllib.request.Request( + f"{VAULT_ADDR}/v1/kv/data/atlas/{path}", + data=json.dumps(payload).encode("utf-8"), + headers={"X-Vault-Token": token, "Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=30) as resp: + resp.read() + + def random_password(length: int = 32) -> str: + alphabet = string.ascii_letters + string.digits + return "".join(secrets.choice(alphabet) for _ in range(length)) + + def ensure_registration_secret(token: str) -> str: + data = vault_get(token, "comms/synapse-registration") + secret = (data.get("registration_shared_secret") or "").strip() + if not secret: + secret = secrets.token_urlsafe(32) + data["registration_shared_secret"] = secret + vault_put(token, "comms/synapse-registration", data) + log("registration secret created") + return secret + + def ensure_admin_creds(token: str) -> dict: + data = vault_get(token, "comms/synapse-admin") + username = (data.get("username") or "").strip() or "synapse-admin" + password = (data.get("password") or "").strip() + if not password: + password = random_password() + data["username"] = username + data["password"] = password + vault_put(token, "comms/synapse-admin", data) + return data + + def register_admin(secret: str, username: str, password: str) -> str: + nonce_payload = request_json(f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register") + nonce = nonce_payload.get("nonce") + if not nonce: + raise RuntimeError("synapse register nonce missing") + admin_flag = "admin" + user_type = "" + mac_payload = "\x00".join([nonce, username, password, admin_flag, user_type]) + mac = hmac.new(secret.encode("utf-8"), mac_payload.encode("utf-8"), hashlib.sha1).hexdigest() + payload = { + "nonce": nonce, + "username": username, + "password": password, + "admin": True, + "mac": mac, + } + req = urllib.request.Request( + f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register", + data=json.dumps(payload).encode("utf-8"), + headers={"Content-Type": "application/json"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + payload = json.loads(resp.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + body = exc.read().decode("utf-8") + raise RuntimeError(f"synapse admin register failed: {exc.code} {body}") from exc + access_token = payload.get("access_token") + if not access_token: + raise RuntimeError("synapse admin token missing") + return access_token + + vault_token = vault_login() + reg_secret = ensure_registration_secret(vault_token) + admin_data = ensure_admin_creds(vault_token) + if admin_data.get("access_token"): + log("synapse admin token already present") + raise SystemExit(0) + access_token = register_admin(reg_secret, admin_data["username"], admin_data["password"]) + admin_data["access_token"] = access_token + vault_put(vault_token, "comms/synapse-admin", admin_data) + log("synapse admin user ensured") + PY diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 6fa638d..fce1ded 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -69,6 +69,9 @@ spec: export COMMS_BOT_PASSWORD="{{ index .Data.data "bot-password" }}" export COMMS_SEEDER_PASSWORD="{{ index .Data.data "seeder-password" }}" {{ end }} + {{ with secret "kv/data/atlas/comms/synapse-admin" }} + export COMMS_SYNAPSE_ADMIN_TOKEN="{{ .Data.data.access_token }}" + {{ end }} {{ with secret "kv/data/atlas/comms/synapse-db" }} export 
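register_admin above implements Synapse's shared-secret registration flow: fetch a nonce from /_synapse/admin/v1/register, then HMAC-SHA1 the NUL-joined fields under the registration shared secret. The reference algorithm only appends user_type when one is actually submitted, so joining an empty user_type (as the version above does) adds a trailing NUL the server never sees; that mismatch may be why a later patch in this series abandons the endpoint for direct DB writes. A standalone sketch of the documented MAC:

    import hashlib
    import hmac

    def registration_mac(shared_secret: str, nonce: str, username: str,
                         password: str, admin: bool) -> str:
        # HMAC-SHA1 over NUL-joined fields: nonce, user, password, then
        # "admin"/"notadmin"; user_type is appended only when non-empty.
        mac = hmac.new(shared_secret.encode("utf-8"), digestmod=hashlib.sha1)
        for part in (nonce, username, password):
            mac.update(part.encode("utf-8"))
            mac.update(b"\x00")
        mac.update(b"admin" if admin else b"notadmin")
        return mac.hexdigest()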
COMMS_SYNAPSE_DB_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}" {{ end }} diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 21132c7..0212180 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ - "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" "" + "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ @@ -253,4 +253,4 @@ write_policy_and_role "crypto-secrets" "crypto" "crypto-secrets-ensure" \ write_policy_and_role "comms-secrets" "comms" \ "comms-secrets-ensure,mas-db-ensure,mas-admin-client-secret-writer,othrys-synapse-signingkey-job" \ "" \ - "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey" + "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin comms/synapse-registration comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey" From c219019ad5a08d2fccc7cac567d750f856edbaac Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:51:20 -0300 Subject: [PATCH 283/416] atlasbot: add knowledge summaries and better fallback --- services/comms/scripts/atlasbot/bot.py | 110 +++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 7 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f8b3ccf..3a1a000 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -254,14 +254,14 @@ def load_kb(): _NAME_INDEX = names _METRIC_INDEX = metrics if isinstance(metrics, list) else [] -def kb_retrieve(query: str, *, limit: int = 3) -> str: +def _score_kb_docs(query: str) -> list[dict[str, Any]]: q = (query or "").strip() if not q or not KB.get("runbooks"): - return "" + return [] ql = q.lower() q_tokens = _tokens(q) if not q_tokens: - return "" + return [] scored: list[tuple[int, dict]] = [] for doc in KB.get("runbooks", []): @@ -281,9 +281,16 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: score += 4 if score: scored.append((score, doc)) - scored.sort(key=lambda x: x[0], reverse=True) - picked = [d for _, d in scored[:limit]] + return [d for _, d in scored] + + +def 
kb_retrieve(query: str, *, limit: int = 3) -> str: + q = (query or "").strip() + if not q: + return "" + scored = _score_kb_docs(q) + picked = scored[:limit] if not picked: return "" @@ -301,6 +308,22 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: used += len(chunk) return "\n".join(parts).strip() + +def kb_retrieve_titles(query: str, *, limit: int = 4) -> str: + scored = _score_kb_docs(query) + picked = scored[:limit] + if not picked: + return "" + parts = ["Relevant runbooks:"] + for doc in picked: + title = doc.get("title") or doc.get("path") or "runbook" + path = doc.get("path") or "" + if path: + parts.append(f"- {title} ({path})") + else: + parts.append(f"- {title}") + return "\n".join(parts) + def _extract_titan_nodes(text: str) -> list[str]: cleaned = normalize_query(text) names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n} @@ -439,6 +462,18 @@ def _format_metric_label(metric: dict[str, Any]) -> str: return ", ".join(label_parts) if label_parts else "series" +def _primary_series_metric(res: dict | None) -> tuple[str | None, str | None]: + series = _vm_value_series(res or {}) + if not series: + return (None, None) + first = series[0] + metric = first.get("metric") if isinstance(first, dict) else {} + value = first.get("value") if isinstance(first, dict) else [] + node = metric.get("node") if isinstance(metric, dict) else None + val = value[1] if isinstance(value, list) and len(value) > 1 else None + return (node, val) + + def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str: series = _vm_value_series(res) panel = entry.get("panel_title") or "Metric" @@ -677,7 +712,15 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s scope_parts.append("worker") if scope_parts: scope = " ".join(scope_parts) - return f"Among {scope} nodes, {answer}" + overall_note = "" + base_res = vm_query(entry["exprs"][0], timeout=20) + base_node, base_val = _primary_series_metric(base_res) + scoped_node, scoped_val = _primary_series_metric(res) + if base_node and scoped_node and base_node != scoped_node: + percent = _metric_expr_uses_percent(entry) + base_val_fmt = _format_metric_value(base_val or "", percent=percent) + overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})." + return f"Among {scope} nodes, {answer}{overall_note}" return answer if metrics_summary: return metrics_summary @@ -1075,7 +1118,7 @@ def _context_fallback(context: str) -> str: trimmed = context.strip() if len(trimmed) > MAX_TOOL_CHARS: trimmed = trimmed[: MAX_TOOL_CHARS - 3].rstrip() + "..." - return "I couldn’t reach the model backend. 
Here is the data I found:\n" + trimmed + return "Here is what I found:\n" + trimmed def vm_top_restarts(hours: int = 1) -> str: q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" @@ -1192,6 +1235,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): return inventory = node_inventory_live() answer = structured_answer(prompt, inventory=inventory, metrics_summary="") + if not answer and _knowledge_intent(prompt): + answer = knowledge_summary(prompt, inventory) + if not answer: + kb = kb_retrieve_titles(prompt, limit=4) + answer = kb or "" self._write_json(200, {"answer": answer}) @@ -1257,6 +1305,48 @@ def build_context( return "\n\n".join([p for p in parts if p]).strip() + +def _knowledge_intent(prompt: str) -> bool: + q = normalize_query(prompt) + return any( + phrase in q + for phrase in ( + "what do you know", + "tell me about", + "overview", + "summary", + "describe", + "explain", + "what is", + ) + ) + + +def _inventory_summary(inventory: list[dict[str, Any]]) -> str: + if not inventory: + return "" + groups = _group_nodes(inventory) + total = len(inventory) + ready = [n for n in inventory if n.get("ready") is True] + not_ready = [n for n in inventory if n.get("ready") is False] + parts = [f"Atlas cluster: {total} nodes ({len(ready)} ready, {len(not_ready)} not ready)."] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes = groups.get(key) or [] + if nodes: + parts.append(f"- {key}: {len(nodes)} nodes ({', '.join(nodes)})") + return "\n".join(parts) + + +def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: + parts: list[str] = [] + inv = _inventory_summary(inventory) + if inv: + parts.append(inv) + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + parts.append(kb_titles) + return "\n".join(parts).strip() + def _ollama_call(hist_key, prompt: str, *, context: str) -> str: system = ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. 
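Splitting _score_kb_docs out lets the same ranking back both the full-text kb_retrieve context and the new titles-only list. The scoring itself is plain token overlap with a title boost; a toy version of the idea, with an invented two-doc corpus standing in for the runbook KB (the real scorer also rewards phrase and token-sequence hits, and the weights below are illustrative):

    import re

    def tokens(text: str) -> set[str]:
        return set(re.findall(r"[a-z0-9]+", text.lower()))

    def score_docs(query: str, docs: list[dict]) -> list[dict]:
        q = tokens(query)
        scored = []
        for doc in docs:
            s = 3 * len(q & tokens(doc.get("title", ""))) \
                + len(q & tokens(doc.get("body", "")))
            if s:
                scored.append((s, doc))
        scored.sort(key=lambda item: item[0], reverse=True)
        return [doc for _, doc in scored]

    docs = [{"title": "vault unseal runbook", "body": "how to unseal vault"},
            {"title": "synapse restarts", "body": "matrix homeserver ops"}]
    assert score_docs("how do I unseal vault?", docs)[0]["title"] == "vault unseal runbook"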
" @@ -1416,6 +1506,12 @@ def sync_loop(token: str, room_id: str): send_msg(token, rid, structured) continue + if _knowledge_intent(body): + summary = knowledge_summary(body, inventory) + if summary: + send_msg(token, rid, summary) + continue + reply = ollama_reply_with_thinking( token, rid, From 67b9babc0ed813c184db67f5c4c9bd09821125a9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:51:33 -0300 Subject: [PATCH 284/416] comms: restart atlasbot for knowledge summaries --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 83e0b2e..5198f2a 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-27 + checksum/atlasbot-configmap: manual-atlasbot-28 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From d6b9d64e7004e95f4d86fc4ae0c60c6439d8abca Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:53:33 -0300 Subject: [PATCH 285/416] atlasbot: scope overall hottest node to atlas inventory --- services/comms/scripts/atlasbot/bot.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 3a1a000..8df1317 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -713,7 +713,12 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s if scope_parts: scope = " ".join(scope_parts) overall_note = "" - base_res = vm_query(entry["exprs"][0], timeout=20) + base_expr = entry["exprs"][0] + if inventory: + all_nodes = "|".join([n["name"] for n in inventory]) + if all_nodes: + base_expr = _apply_node_filter(base_expr, all_nodes) + base_res = vm_query(base_expr, timeout=20) base_node, base_val = _primary_series_metric(base_res) scoped_node, scoped_val = _primary_series_metric(res) if base_node and scoped_node and base_node != scoped_node: From 11ba37a4b2d7d79f3d400d86880021809c107ffa Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:53:44 -0300 Subject: [PATCH 286/416] comms: restart atlasbot for scoped hottest --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 5198f2a..e35fa61 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-28 + checksum/atlasbot-configmap: manual-atlasbot-29 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 292d513e103b8570e254b3cffefdad709d20c85e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:58:13 -0300 Subject: [PATCH 287/416] comms: ensure synapse admin token --- services/comms/synapse-admin-ensure-job.yaml | 141 ++++++++++++------- 1 file changed, 89 insertions(+), 52 deletions(-) diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/synapse-admin-ensure-job.yaml index be9e0fd..6ddea83 100644 --- 
a/services/comms/synapse-admin-ensure-job.yaml +++ b/services/comms/synapse-admin-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-admin-ensure-1 + name: synapse-admin-ensure-2 namespace: comms spec: backoffLimit: 1 @@ -40,24 +40,26 @@ spec: - -c - | set -euo pipefail + pip install --no-cache-dir psycopg2-binary bcrypt >/dev/null python - <<'PY' - import base64 - import hashlib - import hmac import json import os import secrets import string + import time import urllib.error import urllib.request + import bcrypt + import psycopg2 + VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/") VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets") - SYNAPSE_ADMIN_URL = os.environ.get( - "SYNAPSE_ADMIN_URL", - "http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008", - ).rstrip("/") SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" + PGHOST = "postgres-service.postgres.svc.cluster.local" + PGPORT = 5432 + PGDATABASE = "synapse" + PGUSER = "synapse" def log(msg: str) -> None: print(msg, flush=True) @@ -110,16 +112,6 @@ spec: alphabet = string.ascii_letters + string.digits return "".join(secrets.choice(alphabet) for _ in range(length)) - def ensure_registration_secret(token: str) -> str: - data = vault_get(token, "comms/synapse-registration") - secret = (data.get("registration_shared_secret") or "").strip() - if not secret: - secret = secrets.token_urlsafe(32) - data["registration_shared_secret"] = secret - vault_put(token, "comms/synapse-registration", data) - log("registration secret created") - return secret - def ensure_admin_creds(token: str) -> dict: data = vault_get(token, "comms/synapse-admin") username = (data.get("username") or "").strip() or "synapse-admin" @@ -131,47 +123,92 @@ spec: vault_put(token, "comms/synapse-admin", data) return data - def register_admin(secret: str, username: str, password: str) -> str: - nonce_payload = request_json(f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register") - nonce = nonce_payload.get("nonce") - if not nonce: - raise RuntimeError("synapse register nonce missing") - admin_flag = "admin" - user_type = "" - mac_payload = "\x00".join([nonce, username, password, admin_flag, user_type]) - mac = hmac.new(secret.encode("utf-8"), mac_payload.encode("utf-8"), hashlib.sha1).hexdigest() - payload = { - "nonce": nonce, - "username": username, - "password": password, - "admin": True, - "mac": mac, + def ensure_user(cur, cols, user_id, password, admin): + now_ms = int(time.time() * 1000) + values = { + "name": user_id, + "password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(), + "creation_ts": now_ms, } - req = urllib.request.Request( - f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register", - data=json.dumps(payload).encode("utf-8"), - headers={"Content-Type": "application/json"}, - method="POST", + + def add_flag(name, flag): + if name not in cols: + return + if cols[name]["type"] in ("smallint", "integer"): + values[name] = int(flag) + else: + values[name] = bool(flag) + + add_flag("admin", admin) + add_flag("deactivated", False) + add_flag("shadow_banned", False) + add_flag("is_guest", False) + + columns = list(values.keys()) + placeholders = ", ".join(["%s"] * len(columns)) + updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"]) + query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};" + cur.execute(query, [values[c] for c in columns]) + + 
def get_cols(cur): + cur.execute( + """ + SELECT column_name, is_nullable, column_default, data_type + FROM information_schema.columns + WHERE table_schema = 'public' AND table_name = 'users' + """ + ) + cols = {} + for name, is_nullable, default, data_type in cur.fetchall(): + cols[name] = { + "nullable": is_nullable == "YES", + "default": default, + "type": data_type, + } + return cols + + def ensure_access_token(cur, user_id, token_value): + cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens") + token_id = cur.fetchone()[0] + cur.execute( + """ + INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms) + VALUES (%s, %s, %s, %s, NULL) + ON CONFLICT (token) DO NOTHING + """, + (token_id, user_id, token_value, "ariadne-admin"), ) - try: - with urllib.request.urlopen(req, timeout=30) as resp: - payload = json.loads(resp.read().decode("utf-8")) - except urllib.error.HTTPError as exc: - body = exc.read().decode("utf-8") - raise RuntimeError(f"synapse admin register failed: {exc.code} {body}") from exc - access_token = payload.get("access_token") - if not access_token: - raise RuntimeError("synapse admin token missing") - return access_token vault_token = vault_login() - reg_secret = ensure_registration_secret(vault_token) admin_data = ensure_admin_creds(vault_token) if admin_data.get("access_token"): log("synapse admin token already present") raise SystemExit(0) - access_token = register_admin(reg_secret, admin_data["username"], admin_data["password"]) - admin_data["access_token"] = access_token + + synapse_db = vault_get(vault_token, "comms/synapse-db") + pg_password = synapse_db.get("POSTGRES_PASSWORD") + if not pg_password: + raise RuntimeError("synapse db password missing") + + user_id = f"@{admin_data['username']}:live.bstein.dev" + conn = psycopg2.connect( + host=PGHOST, + port=PGPORT, + dbname=PGDATABASE, + user=PGUSER, + password=pg_password, + ) + token_value = secrets.token_urlsafe(32) + try: + with conn: + with conn.cursor() as cur: + cols = get_cols(cur) + ensure_user(cur, cols, user_id, admin_data["password"], True) + ensure_access_token(cur, user_id, token_value) + finally: + conn.close() + + admin_data["access_token"] = token_value vault_put(vault_token, "comms/synapse-admin", admin_data) - log("synapse admin user ensured") + log("synapse admin token stored") PY From 2dc208e919aad757dfb60fb20a53e912a3e5a7d2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 05:02:02 -0300 Subject: [PATCH 288/416] comms: retain synapse admin ensure logs --- services/comms/synapse-admin-ensure-job.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/synapse-admin-ensure-job.yaml index 6ddea83..5ddf60c 100644 --- a/services/comms/synapse-admin-ensure-job.yaml +++ b/services/comms/synapse-admin-ensure-job.yaml @@ -2,15 +2,15 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-admin-ensure-2 + name: synapse-admin-ensure-3 namespace: comms spec: - backoffLimit: 1 + backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: spec: serviceAccountName: comms-secrets-ensure - restartPolicy: OnFailure + restartPolicy: Never affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -40,7 +40,7 @@ spec: - -c - | set -euo pipefail - pip install --no-cache-dir psycopg2-binary bcrypt >/dev/null + pip install --no-cache-dir psycopg2-binary bcrypt python - <<'PY' import json import os From b1aad04f3e78f528578d9b560273239a687b032f Mon Sep 17 00:00:00 2001 From: 
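The rewrite above provisions the admin by writing to Synapse's Postgres schema directly: users.password_hash holds a bcrypt digest, and a row in access_tokens is enough for admin API calls. One caveat, offered as a hedge rather than a confirmed gap: Synapse verifies bcrypt over the password plus the optional password_config.pepper from homeserver.yaml, so if a pepper is configured, a hash computed without it will not verify at login (the stored access token still works either way). A sketch of the peppered form:

    import bcrypt

    def synapse_password_hash(password: str, pepper: str = "") -> str:
        # Synapse checks bcrypt(password + pepper); pepper defaults to ""
        # but comes from password_config.pepper when configured.
        return bcrypt.hashpw((password + pepper).encode("utf-8"),
                             bcrypt.gensalt()).decode("ascii")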
flux-bot Date: Tue, 27 Jan 2026 08:14:36 +0000 Subject: [PATCH 289/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index c8f9f2c..1392855 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-54 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-56 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 89935a579ab8cdb53175d32ec1a54fdfc63561b7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 05:41:58 -0300 Subject: [PATCH 290/416] atlasbot: use cluster snapshot + model update --- services/ai-llm/deployment.yaml | 4 +- services/comms/atlasbot-deployment.yaml | 6 +- services/comms/scripts/atlasbot/bot.py | 368 +++++++++++++++++++++--- 3 files changed, 334 insertions(+), 44 deletions(-) diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index 4f34d86..43d14c8 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -20,7 +20,7 @@ spec: labels: app: ollama annotations: - ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0 + ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0 ai.bstein.dev/gpu: GPU pool (titan-22/24) ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z" spec: @@ -52,7 +52,7 @@ spec: - name: OLLAMA_MODELS value: /root/.ollama - name: OLLAMA_MODEL - value: qwen2.5-coder:7b-instruct-q4_0 + value: qwen2.5:7b-instruct-q4_0 command: - /bin/sh - -c diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index e35fa61..0ee86f0 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -82,11 +82,13 @@ spec: - name: OLLAMA_URL value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL - value: qwen2.5-coder:7b-instruct-q4_0 + value: qwen2.5:7b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC - value: "480" + value: "600" - name: ATLASBOT_THINKING_INTERVAL_SEC value: "120" + - name: ATLASBOT_SNAPSHOT_TTL_SEC + value: "30" - name: ATLASBOT_HTTP_PORT value: "8090" ports: diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 8df1317..9f6c38d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -21,6 +21,7 @@ API_KEY = os.environ.get("CHAT_API_KEY", "") OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090")) ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "") +SNAPSHOT_TTL_SEC = int(os.environ.get("ATLASBOT_SNAPSHOT_TTL_SEC", "30")) KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") @@ -523,7 +524,7 @@ def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool: hw = node.get("hardware") or "" arch = node.get("arch") or "" for f in filters: - if f == "rpi" and hw in ("rpi4", "rpi5"): + if f == "rpi" and hw in ("rpi4", "rpi5", "rpi"): return True if f == "arm64" and arch == "arm64": return True @@ -546,7 +547,7 @@ def _hardware_class(labels: dict[str, Any]) 
-> str: if str(labels.get("jetson") or "").lower() == "true": return "jetson" hardware = (labels.get("hardware") or "").strip().lower() - if hardware in ("rpi4", "rpi5"): + if hardware in ("rpi4", "rpi5", "rpi"): return hardware arch = labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "" if arch == "amd64": @@ -580,6 +581,14 @@ def node_inventory_live() -> list[dict[str, Any]]: ) return sorted(inventory, key=lambda item: item["name"]) + +def node_inventory() -> list[dict[str, Any]]: + snapshot = _snapshot_state() + inventory = _snapshot_inventory(snapshot) + if inventory: + return inventory + return node_inventory_live() + def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: grouped: dict[str, list[str]] = collections.defaultdict(list) for node in inventory: @@ -591,7 +600,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): return "" if inventory is None: - inventory = node_inventory_live() + inventory = node_inventory() if not inventory: return "" groups = _group_nodes(inventory) @@ -626,7 +635,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: q = normalize_query(prompt) if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")): - return node_inventory_live() + return node_inventory() return [] def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: @@ -656,11 +665,177 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: "expected_missing": sorted(expected_missing), } -def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: + +def _workload_tokens(entry: dict[str, Any]) -> set[str]: + tokens: set[str] = set() + for key in ("workload", "namespace"): + value = entry.get(key) + if isinstance(value, str) and value: + tokens.update(_tokens(value)) + return tokens + + +def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, Any] | None: + q_tokens = set(_tokens(prompt)) + if not q_tokens: + return None + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + tokens = _workload_tokens(entry) + score = len(tokens & q_tokens) + if score: + scored.append((score, entry)) + if not scored: + return None + scored.sort(key=lambda item: item[0], reverse=True) + return scored[0][1] + + +def _format_confidence(answer: str, confidence: str) -> str: + if not answer: + return "" + return f"{answer}\nConfidence: {confidence}." 
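ATLASBOT_SNAPSHOT_TTL_SEC above feeds a small module-level cache so inventory questions stop hitting Ariadne on every message. The pattern in isolation, assuming a fetch() callable standing in for the Ariadne state call: a monotonic timestamp (immune to wall-clock jumps) plus a stale-on-error fallback:

    import time

    _CACHE: dict = {"payload": None, "ts": 0.0}
    TTL_SEC = 30

    def cached_snapshot(fetch) -> dict | None:
        now = time.monotonic()
        if _CACHE["payload"] is not None and now - _CACHE["ts"] < TTL_SEC:
            return _CACHE["payload"]
        payload = fetch()  # may return None on error
        if isinstance(payload, dict) and payload:
            _CACHE.update(payload=payload, ts=now)
            return payload
        return _CACHE["payload"]  # serve stale data through brief outages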
+ + +def workload_answer(prompt: str, workloads: list[dict[str, Any]]) -> str: + q = normalize_query(prompt) + if not any(word in q for word in ("where", "which", "node", "run", "running", "host", "located")): + return "" + entry = _select_workload(prompt, workloads) + if not entry: + return "" + workload = entry.get("workload") or "" + namespace = entry.get("namespace") or "" + nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {} + primary = entry.get("primary_node") or "" + if not workload or not nodes: + return "" + parts = [] + if primary: + parts.append(f"{primary} (primary)") + for node, count in sorted(nodes.items(), key=lambda item: (-item[1], item[0])): + if node == primary: + continue + parts.append(f"{node} ({count} pod{'s' if count != 1 else ''})") + node_text = ", ".join(parts) if parts else primary + answer = f"{workload} runs in {namespace}. Nodes: {node_text}." + return _format_confidence(answer, "medium") + + +def _snapshot_metrics(snapshot: dict[str, Any] | None) -> dict[str, Any]: + if not snapshot: + return {} + metrics = snapshot.get("metrics") + return metrics if isinstance(metrics, dict) else {} + + +def _node_usage_top( + usage: list[dict[str, Any]], + *, + allowed_nodes: set[str] | None, +) -> tuple[str, float] | None: + best_node = "" + best_val = None + for item in usage if isinstance(usage, list) else []: + if not isinstance(item, dict): + continue + node = item.get("node") or "" + if allowed_nodes and node not in allowed_nodes: + continue + value = item.get("value") + try: + numeric = float(value) + except (TypeError, ValueError): + continue + if best_val is None or numeric > best_val: + best_val = numeric + best_node = node + if best_node and best_val is not None: + return best_node, best_val + return None + + +def snapshot_metric_answer( + prompt: str, + *, + snapshot: dict[str, Any] | None, + inventory: list[dict[str, Any]], +) -> str: + if not snapshot: + return "" + metrics = _snapshot_metrics(snapshot) + if not metrics: + return "" + q = normalize_query(prompt) + metric = _detect_metric(q) + op = _detect_operation(q) + include_hw, exclude_hw = _detect_hardware_filters(q) + nodes_in_query = _extract_titan_nodes(q) + only_workers = "worker" in q or "workers" in q + + filtered = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + allowed_nodes = {node["name"] for node in filtered} if filtered else None + + if metric in {"cpu", "ram", "net", "io"} and op in {"top", "status", None}: + usage = metrics.get("node_usage", {}).get(metric, []) + top = _node_usage_top(usage, allowed_nodes=allowed_nodes) + if top: + node, val = top + percent = metric in {"cpu", "ram"} + value = _format_metric_value(str(val), percent=percent) + scope = "" + if include_hw: + scope = f" among {' and '.join(sorted(include_hw))}" + answer = f"Hottest node{scope}: {node} ({value})." 
+ return _format_confidence(answer, "high") + + if metric == "connections" or "postgres" in q: + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + used = postgres.get("used") + max_conn = postgres.get("max") + hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + parts: list[str] = [] + if used is not None and max_conn is not None: + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") + if hottest.get("label"): + hot_val = hottest.get("value") + hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" + parts.append(f"Hottest DB: {hottest.get('label')} ({hot_val_str}).") + if parts: + return _format_confidence(" ".join(parts), "high") + + return "" + +def structured_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + metrics_summary: str, + snapshot: dict[str, Any] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> str: q = normalize_query(prompt) if not q: return "" + if workloads: + workload_resp = workload_answer(prompt, workloads) + if workload_resp: + return workload_resp + + snap_resp = snapshot_metric_answer(prompt, snapshot=snapshot, inventory=inventory) + if snap_resp: + return snap_resp + tokens = _tokens(q) op = _detect_operation(q) metric = _detect_metric(q) @@ -749,11 +924,20 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s if op == "status": if "missing" in q and expected_workers: missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) - return "Missing nodes: " + (", ".join(missing) if missing else "none") + "." + return _format_confidence( + "Missing nodes: " + (", ".join(missing) if missing else "none") + ".", + "high", + ) if only_ready is False: - return "Not ready nodes: " + (", ".join(names) if names else "none") + "." + return _format_confidence( + "Not ready nodes: " + (", ".join(names) if names else "none") + ".", + "high", + ) if only_ready is True: - return f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + "." + return _format_confidence( + f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + ".", + "high", + ) if op == "count": if expected_workers and ("expected" in q or "should" in q): @@ -761,10 +945,10 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." if missing: msg += f" Missing: {', '.join(missing)}." - return msg + return _format_confidence(msg, "high") if not (include_hw or exclude_hw or nodes_in_query or only_workers): - return f"Atlas has {len(names)} nodes." - return f"Matching nodes: {len(names)}." + return _format_confidence(f"Atlas has {len(names)} nodes.", "high") + return _format_confidence(f"Matching nodes: {len(names)}.", "high") if op == "list": if nodes_in_query: @@ -772,12 +956,12 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s existing = {n["name"] for n in inventory} for node in nodes_in_query: parts.append(f"{node}: {'present' if node in existing else 'not present'}") - return "Node presence: " + ", ".join(parts) + "." + return _format_confidence("Node presence: " + ", ".join(parts) + ".", "high") if not names: - return "Matching nodes: none." 
+ return _format_confidence("Matching nodes: none.", "high") shown = names[:30] suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else "" - return "Matching nodes: " + ", ".join(shown) + suffix + "." + return _format_confidence("Matching nodes: " + ", ".join(shown) + suffix + ".", "high") return "" @@ -922,6 +1106,58 @@ def _ariadne_state(timeout: int = 5) -> dict | None: except Exception: return None + +_SNAPSHOT_CACHE: dict[str, Any] = {"payload": None, "ts": 0.0} + + +def _snapshot_state() -> dict[str, Any] | None: + now = time.monotonic() + cached = _SNAPSHOT_CACHE.get("payload") + ts = _SNAPSHOT_CACHE.get("ts") or 0.0 + if cached and now - ts < max(5, SNAPSHOT_TTL_SEC): + return cached + payload = _ariadne_state(timeout=10) + if isinstance(payload, dict) and payload: + _SNAPSHOT_CACHE["payload"] = payload + _SNAPSHOT_CACHE["ts"] = now + return payload + return cached if isinstance(cached, dict) else None + + +def _snapshot_inventory(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: + if not snapshot: + return [] + items = snapshot.get("nodes_detail") + if not isinstance(items, list): + return [] + inventory: list[dict[str, Any]] = [] + for node in items: + if not isinstance(node, dict): + continue + labels = node.get("labels") if isinstance(node.get("labels"), dict) else {} + name = node.get("name") or "" + if not name: + continue + hardware = node.get("hardware") or _hardware_class(labels) + inventory.append( + { + "name": name, + "arch": node.get("arch") or labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", + "hardware": hardware, + "roles": node.get("roles") or [], + "is_worker": node.get("is_worker") is True, + "ready": node.get("ready") is True, + } + ) + return sorted(inventory, key=lambda item: item["name"]) + + +def _snapshot_workloads(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: + if not snapshot: + return [] + workloads = snapshot.get("workloads") + return workloads if isinstance(workloads, list) else [] + def k8s_pods(namespace: str) -> list[dict]: data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500") items = data.get("items") or [] @@ -1079,25 +1315,11 @@ def _node_is_worker(node: dict) -> bool: return True return True -def worker_nodes_status() -> tuple[list[str], list[str]]: - try: - data = k8s_get("/api/v1/nodes?limit=500") - except Exception: - return ([], []) - items = data.get("items") or [] - ready_nodes: list[str] = [] - not_ready_nodes: list[str] = [] - for node in items if isinstance(items, list) else []: - if not _node_is_worker(node): - continue - name = (node.get("metadata") or {}).get("name") or "" - if not name: - continue - ready = _node_ready_status(node) - if ready is True: - ready_nodes.append(name) - elif ready is False: - not_ready_nodes.append(name) +def worker_nodes_status(inventory: list[dict[str, Any]] | None = None) -> tuple[list[str], list[str]]: + if inventory is None: + inventory = node_inventory() + ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is True] + not_ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is False] return (sorted(ready_nodes), sorted(not_ready_nodes)) def expected_worker_nodes_from_metrics() -> list[str]: @@ -1238,13 +1460,29 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): if not prompt: self._write_json(400, {"error": "missing_prompt"}) return - inventory = node_inventory_live() - answer = structured_answer(prompt, inventory=inventory, metrics_summary="") + 
snapshot = _snapshot_state() + inventory = _snapshot_inventory(snapshot) or node_inventory_live() + workloads = _snapshot_workloads(snapshot) + answer = structured_answer( + prompt, + inventory=inventory, + metrics_summary="", + snapshot=snapshot, + workloads=workloads, + ) if not answer and _knowledge_intent(prompt): answer = knowledge_summary(prompt, inventory) if not answer: kb = kb_retrieve_titles(prompt, limit=4) - answer = kb or "" + context = build_context( + prompt, + allow_tools=False, + targets=[], + inventory=inventory, + snapshot=snapshot, + ) + fallback = kb or "I don't have enough data to answer that." + answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) self._write_json(200, {"answer": answer}) @@ -1266,6 +1504,7 @@ def build_context( allow_tools: bool, targets: list[tuple[str, str]], inventory: list[dict[str, Any]] | None = None, + snapshot: dict[str, Any] | None = None, ) -> str: parts: list[str] = [] @@ -1281,6 +1520,10 @@ def build_context( if node_ctx: parts.append(node_ctx) + snapshot_ctx = snapshot_context(prompt, snapshot) + if snapshot_ctx: + parts.append(snapshot_ctx) + if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set) @@ -1311,6 +1554,33 @@ def build_context( return "\n\n".join([p for p in parts if p]).strip() +def snapshot_context(prompt: str, snapshot: dict[str, Any] | None) -> str: + if not snapshot: + return "" + metrics = _snapshot_metrics(snapshot) + workloads = _snapshot_workloads(snapshot) + q = normalize_query(prompt) + parts: list[str] = [] + nodes = snapshot.get("nodes") if isinstance(snapshot.get("nodes"), dict) else {} + if nodes.get("total") is not None: + parts.append( + f"Snapshot: nodes_total={nodes.get('total')}, ready={nodes.get('ready')}, not_ready={nodes.get('not_ready')}." + ) + if any(word in q for word in ("postgres", "connections", "db")): + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if postgres: + parts.append(f"Snapshot: postgres_connections={postgres}.") + if any(word in q for word in ("hottest", "cpu", "ram", "memory", "net", "network", "io", "disk")): + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if hottest: + parts.append(f"Snapshot: hottest_nodes={hottest}.") + if workloads and any(word in q for word in ("run", "running", "host", "node", "where", "which")): + match = _select_workload(prompt, workloads) + if match: + parts.append(f"Snapshot: workload={match}.") + return "\n".join(parts).strip() + + def _knowledge_intent(prompt: str) -> bool: q = normalize_query(prompt) return any( @@ -1350,7 +1620,8 @@ def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: kb_titles = kb_retrieve_titles(prompt, limit=4) if kb_titles: parts.append(kb_titles) - return "\n".join(parts).strip() + summary = "\n".join(parts).strip() + return _format_confidence(summary, "medium") if summary else "" def _ollama_call(hist_key, prompt: str, *, context: str) -> str: system = ( @@ -1360,7 +1631,8 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "Never include or request secret values. " "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " - "If the answer is not grounded in the provided context or tool data, say you do not know." 
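+        # Require a trailing confidence line so LLM replies follow the same
+        # "Confidence: ..." contract that _format_confidence() applies to
+        # structured answers.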
+ "If the answer is not grounded in the provided context or tool data, say you do not know. " + "End every response with a line: 'Confidence: high|medium|low'." ) transcript_parts = [system] if context: @@ -1491,8 +1763,18 @@ def sync_loop(token: str, room_id: str): if isinstance(w, dict) and w.get("name"): targets.append((ns, str(w["name"]))) + snapshot = _snapshot_state() inventory = node_inventory_for_prompt(body) - context = build_context(body, allow_tools=allow_tools, targets=targets, inventory=inventory) + if not inventory: + inventory = _snapshot_inventory(snapshot) + workloads = _snapshot_workloads(snapshot) + context = build_context( + body, + allow_tools=allow_tools, + targets=targets, + inventory=inventory, + snapshot=snapshot, + ) if allow_tools and promql: res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" @@ -1506,7 +1788,13 @@ def sync_loop(token: str, room_id: str): if not fallback and context: fallback = _context_fallback(context) - structured = structured_answer(body, inventory=inventory, metrics_summary=metrics_fallback or "") + structured = structured_answer( + body, + inventory=inventory, + metrics_summary=metrics_fallback or "", + snapshot=snapshot, + workloads=workloads, + ) if structured: send_msg(token, rid, structured) continue From 1459027abcee8aaa162a4e1c9fa1cd6fa48157be Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 08:50:29 +0000 Subject: [PATCH 291/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 1392855..0f8cd2a 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-56 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-57 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 4fcecc470737cb3c2f6b8d154a3912bb5aea4235 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 09:00:40 +0000 Subject: [PATCH 292/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 0f8cd2a..e4580aa 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-57 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-58 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From a2f4c51e1de5447a3c08c814c77d5f0ad38ab7f4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 06:28:03 -0300 Subject: [PATCH 293/416] atlasbot: shift to facts context and upgrade model --- services/ai-llm/deployment.yaml | 4 +- services/comms/atlasbot-deployment.yaml | 4 +- services/comms/scripts/atlasbot/bot.py | 203 +++++++++++++++++------- 3 files changed, 151 insertions(+), 60 deletions(-) diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index 43d14c8..bf012c0 100644 --- a/services/ai-llm/deployment.yaml +++ 
b/services/ai-llm/deployment.yaml @@ -20,7 +20,7 @@ spec: labels: app: ollama annotations: - ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0 + ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0 ai.bstein.dev/gpu: GPU pool (titan-22/24) ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z" spec: @@ -52,7 +52,7 @@ spec: - name: OLLAMA_MODELS value: /root/.ollama - name: OLLAMA_MODEL - value: qwen2.5:7b-instruct-q4_0 + value: qwen2.5:14b-instruct-q4_0 command: - /bin/sh - -c diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 0ee86f0..f4883c4 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-29 + checksum/atlasbot-configmap: manual-atlasbot-30 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -82,7 +82,7 @@ spec: - name: OLLAMA_URL value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL - value: qwen2.5:7b-instruct-q4_0 + value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC value: "600" - name: ATLASBOT_THINKING_INTERVAL_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9f6c38d..a91744d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -33,7 +33,10 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) +MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000")) THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) +OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2")) +OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false" TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE) HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)") @@ -113,6 +116,8 @@ METRIC_HINTS = { "connections": ("connections", "conn", "postgres", "database", "db"), } +_OLLAMA_LOCK = threading.Lock() + HARDWARE_HINTS = { "amd64": ("amd64", "x86", "x86_64", "x86-64"), "jetson": ("jetson",), @@ -638,6 +643,105 @@ def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: return node_inventory() return [] +def _nodes_by_arch(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: + grouped: dict[str, list[str]] = collections.defaultdict(list) + for node in inventory: + grouped[(node.get("arch") or "unknown")].append(node["name"]) + return {k: sorted(v) for k, v in grouped.items()} + +def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]: + usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} + per_node: dict[str, dict[str, Any]] = {} + for metric_name, entries in usage.items() if isinstance(usage, dict) else []: + if not isinstance(entries, list): + continue + for entry in entries: + if not isinstance(entry, dict): + continue + node = entry.get("node") + if not isinstance(node, str) or not node: + continue + per_node.setdefault(node, {})[metric_name] = entry.get("value") + return [{"node": node, **vals} for node, vals in sorted(per_node.items())] + +def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]: + 
cleaned: list[dict[str, Any]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + cleaned.append( + { + "namespace": entry.get("namespace"), + "workload": entry.get("workload"), + "pods_total": entry.get("pods_total"), + "pods_running": entry.get("pods_running"), + "primary_node": entry.get("primary_node"), + "nodes": entry.get("nodes"), + } + ) + cleaned.sort( + key=lambda item: ( + -(item.get("pods_total") or 0), + str(item.get("namespace") or ""), + str(item.get("workload") or ""), + ) + ) + return cleaned[:limit] + +def facts_context( + prompt: str, + *, + inventory: list[dict[str, Any]] | None, + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + inv = inventory or [] + metrics = _snapshot_metrics(snapshot) + nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {} + summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} + expected_workers = expected_worker_nodes_from_metrics() + ready_workers, not_ready_workers = worker_nodes_status(inv) if inv else ([], []) + + facts: dict[str, Any] = { + "generated_at": snapshot.get("generated_at") if isinstance(snapshot, dict) else None, + "nodes": { + "total": summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total"), + "ready": summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready"), + "not_ready": summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready"), + "not_ready_names": summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names"), + "by_hardware": _group_nodes(inv) if inv else {}, + "by_arch": _nodes_by_arch(inv) if inv else {}, + "workers_ready": ready_workers, + "workers_not_ready": not_ready_workers, + "expected_workers": expected_workers, + }, + "metrics": { + "hottest_nodes": metrics.get("hottest_nodes") if isinstance(metrics, dict) else {}, + "postgres_connections": metrics.get("postgres_connections") if isinstance(metrics, dict) else {}, + "node_usage": _node_usage_table(metrics) if isinstance(metrics, dict) else [], + }, + "workloads": _workloads_for_facts(workloads or []), + } + + rendered = json.dumps(facts, ensure_ascii=False) + if len(rendered) <= MAX_FACTS_CHARS: + return "Facts (live snapshot):\n" + rendered + + trimmed = dict(facts) + trimmed.pop("workloads", None) + rendered = json.dumps(trimmed, ensure_ascii=False) + if len(rendered) <= MAX_FACTS_CHARS: + return "Facts (live snapshot):\n" + rendered + + trimmed_metrics = dict(trimmed.get("metrics") or {}) + trimmed_metrics.pop("node_usage", None) + trimmed["metrics"] = trimmed_metrics + rendered = json.dumps(trimmed, ensure_ascii=False) + if len(rendered) <= MAX_FACTS_CHARS: + return "Facts (live snapshot):\n" + rendered + + return "Facts (live snapshot):\n" + rendered[:MAX_FACTS_CHARS] + def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: names = [node["name"] for node in inventory] ready = [node["name"] for node in inventory if node.get("ready") is True] @@ -1463,26 +1567,19 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) - answer = structured_answer( + context = build_context( prompt, + allow_tools=False, + targets=[], inventory=inventory, - metrics_summary="", snapshot=snapshot, workloads=workloads, ) - if not 
answer and _knowledge_intent(prompt): - answer = knowledge_summary(prompt, inventory) - if not answer: - kb = kb_retrieve_titles(prompt, limit=4) - context = build_context( - prompt, - allow_tools=False, - targets=[], - inventory=inventory, - snapshot=snapshot, - ) - fallback = kb or "I don't have enough data to answer that." - answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) + metrics_context, metrics_fallback = metrics_query_context(prompt, allow_tools=True) + if metrics_context: + context = (context + "\n\n" + metrics_context).strip() if context else metrics_context + fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that." + answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) self._write_json(200, {"answer": answer}) @@ -1505,10 +1602,13 @@ def build_context( targets: list[tuple[str, str]], inventory: list[dict[str, Any]] | None = None, snapshot: dict[str, Any] | None = None, + workloads: list[dict[str, Any]] | None = None, ) -> str: parts: list[str] = [] kb = kb_retrieve(prompt) + if not kb and _knowledge_intent(prompt): + kb = kb_retrieve_titles(prompt, limit=4) if kb: parts.append(kb) @@ -1516,13 +1616,9 @@ def build_context( if endpoints: parts.append(endpoints) - node_ctx = node_inventory_context(prompt, inventory) - if node_ctx: - parts.append(node_ctx) - - snapshot_ctx = snapshot_context(prompt, snapshot) - if snapshot_ctx: - parts.append(snapshot_ctx) + facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if facts: + parts.append(facts) if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. @@ -1627,7 +1723,9 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: system = ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " - "Prefer answering with exact repo paths and Kubernetes resource names. " + "Use the provided context and facts as your source of truth. " + "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " + "Prefer exact repo paths and Kubernetes resource names when relevant. " "Never include or request secret values. " "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " @@ -1646,21 +1744,32 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: if API_KEY: headers["x-api-key"] = API_KEY r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) - with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: - data = json.loads(resp.read().decode()) - raw_reply = data.get("message") or data.get("response") or data.get("reply") or data - reply = _normalize_reply(raw_reply) or "I'm here to help." - history[hist_key].append(f"Atlas: {reply}") - return reply + lock = _OLLAMA_LOCK if OLLAMA_SERIALIZE else None + if lock: + lock.acquire() + try: + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + raw_reply = data.get("message") or data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." 
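+            # Record the exchange so the next transcript rebuild replays it
+            # as a prior "Atlas:" turn (only the last 24 entries are replayed).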
+ history[hist_key].append(f"Atlas: {reply}") + return reply + finally: + if lock: + lock.release() def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str: - try: - return _ollama_call(hist_key, prompt, context=context) - except Exception: - if fallback: - history[hist_key].append(f"Atlas: {fallback}") - return fallback - return "Model backend is busy. Try again in a moment." + last_error = None + for attempt in range(max(1, OLLAMA_RETRIES + 1)): + try: + return _ollama_call(hist_key, prompt, context=context) + except Exception as exc: # noqa: BLE001 + last_error = exc + time.sleep(min(4, 2 ** attempt)) + if fallback: + history[hist_key].append(f"Atlas: {fallback}") + return fallback + return "I don't have enough data to answer that." def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str: result: dict[str, str] = {"reply": ""} @@ -1774,6 +1883,7 @@ def sync_loop(token: str, room_id: str): targets=targets, inventory=inventory, snapshot=snapshot, + workloads=workloads, ) if allow_tools and promql: res = vm_query(promql, timeout=20) @@ -1784,26 +1894,7 @@ def sync_loop(token: str, room_id: str): if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = metrics_fallback or "" - if not fallback and context: - fallback = _context_fallback(context) - - structured = structured_answer( - body, - inventory=inventory, - metrics_summary=metrics_fallback or "", - snapshot=snapshot, - workloads=workloads, - ) - if structured: - send_msg(token, rid, structured) - continue - - if _knowledge_intent(body): - summary = knowledge_summary(body, inventory) - if summary: - send_msg(token, rid, summary) - continue + fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that." 
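+            # With no structured-answer short-circuit left, every mention goes
+            # through the LLM; this string only surfaces if the Ollama call
+            # ultimately fails.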
reply = ollama_reply_with_thinking( token, From 446115f07a335f976697b3ceb8fc4a2e1356930b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 06:34:37 -0300 Subject: [PATCH 294/416] atlasbot: enrich facts summary for LLM --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 46 +++++++++++++++++++++---- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index f4883c4..377a076 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-30 + checksum/atlasbot-configmap: manual-atlasbot-31 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index a91744d..3f05529 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -723,24 +723,55 @@ def facts_context( "workloads": _workloads_for_facts(workloads or []), } + summary_lines: list[str] = [] + nodes_info = facts.get("nodes") if isinstance(facts.get("nodes"), dict) else {} + if nodes_info.get("total") is not None: + summary_lines.append( + f"nodes_total={nodes_info.get('total')}, ready={nodes_info.get('ready')}, not_ready={nodes_info.get('not_ready')}" + ) + hottest = facts.get("metrics", {}).get("hottest_nodes") if isinstance(facts.get("metrics"), dict) else {} + if isinstance(hottest, dict) and hottest: + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if node and value is not None: + summary_lines.append(f"hottest_{key}={node} ({value})") + postgres = facts.get("metrics", {}).get("postgres_connections") if isinstance(facts.get("metrics"), dict) else {} + if isinstance(postgres, dict) and postgres: + used = postgres.get("used") + max_conn = postgres.get("max") + if used is not None and max_conn is not None: + summary_lines.append(f"postgres_used={used}, postgres_max={max_conn}") + hottest_db = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + if hottest_db.get("label"): + summary_lines.append(f"postgres_hottest_db={hottest_db.get('label')} ({hottest_db.get('value')})") + rendered = json.dumps(facts, ensure_ascii=False) - if len(rendered) <= MAX_FACTS_CHARS: - return "Facts (live snapshot):\n" + rendered + rendered_parts = [] + if summary_lines: + rendered_parts.append("Facts summary:\n" + "\n".join(f"- {line}" for line in summary_lines)) + rendered_parts.append("Facts (live snapshot JSON):\n" + rendered) + combined = "\n\n".join(rendered_parts) + if len(combined) <= MAX_FACTS_CHARS: + return combined trimmed = dict(facts) trimmed.pop("workloads", None) rendered = json.dumps(trimmed, ensure_ascii=False) - if len(rendered) <= MAX_FACTS_CHARS: - return "Facts (live snapshot):\n" + rendered + combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered]) + if len(combined) <= MAX_FACTS_CHARS: + return combined trimmed_metrics = dict(trimmed.get("metrics") or {}) trimmed_metrics.pop("node_usage", None) trimmed["metrics"] = trimmed_metrics rendered = json.dumps(trimmed, ensure_ascii=False) - if len(rendered) <= MAX_FACTS_CHARS: - 
return "Facts (live snapshot):\n" + rendered + combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered]) + if len(combined) <= MAX_FACTS_CHARS: + return combined - return "Facts (live snapshot):\n" + rendered[:MAX_FACTS_CHARS] + return combined[:MAX_FACTS_CHARS] def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: names = [node["name"] for node in inventory] @@ -1724,6 +1755,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " "Use the provided context and facts as your source of truth. " + "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " "Prefer exact repo paths and Kubernetes resource names when relevant. " "Never include or request secret values. " From c0dd00c93d05fce37c9e89e2a7ff5ab406696c3e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 06:45:18 -0300 Subject: [PATCH 295/416] atlasbot: shrink facts context to avoid truncation --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 148 ++++++++++++++---------- 2 files changed, 89 insertions(+), 61 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 377a076..7cb2d7d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-31 + checksum/atlasbot-configmap: manual-atlasbot-32 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 3f05529..9e8e0dd 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -688,6 +688,20 @@ def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> li ) return cleaned[:limit] +def _workloads_for_prompt(prompt: str, workloads: list[dict[str, Any]], limit: int = 12) -> list[dict[str, Any]]: + tokens = set(_tokens(prompt)) + if tokens: + matched: list[dict[str, Any]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + entry_tokens = _workload_tokens(entry) + if entry_tokens & tokens: + matched.append(entry) + if matched: + return _workloads_for_facts(matched, limit=limit) + return _workloads_for_facts(workloads, limit=limit) + def facts_context( prompt: str, *, @@ -701,77 +715,91 @@ def facts_context( summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} expected_workers = expected_worker_nodes_from_metrics() ready_workers, not_ready_workers = worker_nodes_status(inv) if inv else ([], []) + total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total") + ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready") + not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready") + not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names") + by_hardware = _group_nodes(inv) if inv else {} + by_arch = _nodes_by_arch(inv) 
if inv else {} - facts: dict[str, Any] = { - "generated_at": snapshot.get("generated_at") if isinstance(snapshot, dict) else None, - "nodes": { - "total": summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total"), - "ready": summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready"), - "not_ready": summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready"), - "not_ready_names": summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names"), - "by_hardware": _group_nodes(inv) if inv else {}, - "by_arch": _nodes_by_arch(inv) if inv else {}, - "workers_ready": ready_workers, - "workers_not_ready": not_ready_workers, - "expected_workers": expected_workers, - }, - "metrics": { - "hottest_nodes": metrics.get("hottest_nodes") if isinstance(metrics, dict) else {}, - "postgres_connections": metrics.get("postgres_connections") if isinstance(metrics, dict) else {}, - "node_usage": _node_usage_table(metrics) if isinstance(metrics, dict) else [], - }, - "workloads": _workloads_for_facts(workloads or []), - } - - summary_lines: list[str] = [] - nodes_info = facts.get("nodes") if isinstance(facts.get("nodes"), dict) else {} - if nodes_info.get("total") is not None: - summary_lines.append( - f"nodes_total={nodes_info.get('total')}, ready={nodes_info.get('ready')}, not_ready={nodes_info.get('not_ready')}" + lines: list[str] = ["Facts (live snapshot):"] + if total is not None: + lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}") + if not_ready_names: + lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}") + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes_list = by_hardware.get(key) or [] + if nodes_list: + lines.append(f"- {key}: {', '.join(nodes_list)}") + for key, nodes_list in sorted(by_arch.items()): + if nodes_list: + lines.append(f"- arch {key}: {', '.join(nodes_list)}") + if ready_workers or not_ready_workers: + lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}") + if not_ready_workers: + lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") + if expected_workers: + missing = sorted( + set(expected_workers) + - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")} ) - hottest = facts.get("metrics", {}).get("hottest_nodes") if isinstance(facts.get("metrics"), dict) else {} - if isinstance(hottest, dict) and hottest: - for key in ("cpu", "ram", "net", "io"): - entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} - node = entry.get("node") - value = entry.get("value") - if node and value is not None: - summary_lines.append(f"hottest_{key}={node} ({value})") - postgres = facts.get("metrics", {}).get("postgres_connections") if isinstance(facts.get("metrics"), dict) else {} + lines.append(f"- expected_workers: {', '.join(expected_workers)}") + if missing: + lines.append(f"- expected_workers_missing: {', '.join(missing)}") + + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if node and value is not None: + lines.append(f"- hottest_{key}: {node} ({value})") + + postgres = metrics.get("postgres_connections") if 
isinstance(metrics.get("postgres_connections"), dict) else {} if isinstance(postgres, dict) and postgres: used = postgres.get("used") max_conn = postgres.get("max") if used is not None and max_conn is not None: - summary_lines.append(f"postgres_used={used}, postgres_max={max_conn}") + lines.append(f"- postgres_connections: {used} used / {max_conn} max") hottest_db = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} if hottest_db.get("label"): - summary_lines.append(f"postgres_hottest_db={hottest_db.get('label')} ({hottest_db.get('value')})") + lines.append( + f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})" + ) - rendered = json.dumps(facts, ensure_ascii=False) - rendered_parts = [] - if summary_lines: - rendered_parts.append("Facts summary:\n" + "\n".join(f"- {line}" for line in summary_lines)) - rendered_parts.append("Facts (live snapshot JSON):\n" + rendered) - combined = "\n\n".join(rendered_parts) - if len(combined) <= MAX_FACTS_CHARS: - return combined + usage_table = _node_usage_table(metrics) + if usage_table: + lines.append("- node_usage (cpu/ram/net/io):") + for entry in usage_table: + node = entry.get("node") + if not node: + continue + cpu = entry.get("cpu") + ram = entry.get("ram") + net = entry.get("net") + io_val = entry.get("io") + lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") - trimmed = dict(facts) - trimmed.pop("workloads", None) - rendered = json.dumps(trimmed, ensure_ascii=False) - combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered]) - if len(combined) <= MAX_FACTS_CHARS: - return combined + workload_entries = _workloads_for_prompt(prompt, workloads or []) + if workload_entries: + lines.append("- workloads:") + for entry in workload_entries: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + wl = entry.get("workload") or "" + primary = entry.get("primary_node") or "" + pods_total = entry.get("pods_total") + label = f"{ns}/{wl}" if ns and wl else (wl or ns) + if not label: + continue + if primary: + lines.append(f" - {label}: primary_node={primary}, pods_total={pods_total}") + else: + lines.append(f" - {label}: pods_total={pods_total}") - trimmed_metrics = dict(trimmed.get("metrics") or {}) - trimmed_metrics.pop("node_usage", None) - trimmed["metrics"] = trimmed_metrics - rendered = json.dumps(trimmed, ensure_ascii=False) - combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered]) - if len(combined) <= MAX_FACTS_CHARS: - return combined - - return combined[:MAX_FACTS_CHARS] + rendered = "\n".join(lines) + return rendered[:MAX_FACTS_CHARS] def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: names = [node["name"] for node in inventory] From a442ea6d5d9abae9b8397bad6b3e7db8e6151881 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 11:03:55 -0300 Subject: [PATCH 296/416] atlasbot: strengthen facts context and replies --- services/comms/scripts/atlasbot/bot.py | 91 +++++++++++++++++++------- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9e8e0dd..e0056f8 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -34,6 +34,7 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_TOOL_CHARS = 
int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000")) +MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000")) THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2")) OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false" @@ -100,6 +101,7 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" +CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECASE) OPERATION_HINTS = { "count": ("how many", "count", "number", "total"), @@ -139,6 +141,20 @@ def _tokens(text: str) -> list[str]: return [t for t in toks if t not in STOPWORDS and len(t) >= 2] +def _ensure_confidence(text: str) -> str: + if not text: + return "" + lines = text.strip().splitlines() + for idx, line in enumerate(lines): + match = CONFIDENCE_RE.search(line) + if match: + level = match.group(1).lower() + lines[idx] = f"Confidence: {level}" + return "\n".join(lines) + lines.append("Confidence: medium") + return "\n".join(lines) + + # Mention detection (Matrix rich mentions + plain @atlas). MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()] MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS] @@ -710,6 +726,7 @@ def facts_context( workloads: list[dict[str, Any]] | None, ) -> str: inv = inventory or [] + nodes_in_query = _extract_titan_nodes(prompt) metrics = _snapshot_metrics(snapshot) nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {} summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} @@ -721,6 +738,12 @@ def facts_context( not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names") by_hardware = _group_nodes(inv) if inv else {} by_arch = _nodes_by_arch(inv) if inv else {} + control_plane_nodes = [ + node["name"] + for node in inv + if any(role in ("control-plane", "master") for role in (node.get("roles") or [])) + ] + worker_nodes = [node["name"] for node in inv if node.get("is_worker") is True] lines: list[str] = ["Facts (live snapshot):"] if total is not None: @@ -731,9 +754,16 @@ def facts_context( nodes_list = by_hardware.get(key) or [] if nodes_list: lines.append(f"- {key}: {', '.join(nodes_list)}") + non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", []))) + if non_rpi: + lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}") for key, nodes_list in sorted(by_arch.items()): if nodes_list: lines.append(f"- arch {key}: {', '.join(nodes_list)}") + if control_plane_nodes: + lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}") + if worker_nodes: + lines.append(f"- worker_nodes: {', '.join(worker_nodes)}") if ready_workers or not_ready_workers: lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}") if not_ready_workers: @@ -753,7 +783,8 @@ def facts_context( node = entry.get("node") value = entry.get("value") if node and value is not None: - lines.append(f"- hottest_{key}: {node} ({value})") + value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram")) + lines.append(f"- hottest_{key}: {node} 
({value_fmt})") postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} if isinstance(postgres, dict) and postgres: @@ -774,12 +805,25 @@ def facts_context( node = entry.get("node") if not node: continue - cpu = entry.get("cpu") - ram = entry.get("ram") - net = entry.get("net") - io_val = entry.get("io") + cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else "" + ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else "" + net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else "" + io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else "" lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") + if nodes_in_query: + lines.append("- node_details:") + for name in nodes_in_query: + detail = next((n for n in inv if n.get("name") == name), None) + if not detail: + lines.append(f" - {name}: not found in snapshot") + continue + roles = ",".join(detail.get("roles") or []) or "none" + lines.append( + f" - {name}: hardware={detail.get('hardware')}, arch={detail.get('arch')}, " + f"ready={detail.get('ready')}, roles={roles}" + ) + workload_entries = _workloads_for_prompt(prompt, workloads or []) if workload_entries: lines.append("- workloads:") @@ -1181,11 +1225,10 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: if rendered: rendered_parts.append(rendered) if not rendered_parts: - return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data." + return "", "" summary = "\n".join(rendered_parts) context = f"Metrics (from {dashboard} / {panel}):\n{summary}" - fallback = _metrics_fallback_summary(panel, summary) - return context, fallback + return context, "" def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]: q = (query or "").strip() @@ -1574,8 +1617,8 @@ def _normalize_reply(value: Any) -> str: try: return _normalize_reply(json.loads(text)) except Exception: - return text - return text + return _ensure_confidence(text) + return _ensure_confidence(text) # Internal HTTP endpoint for cluster answers (website uses this). @@ -1634,10 +1677,10 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot=snapshot, workloads=workloads, ) - metrics_context, metrics_fallback = metrics_query_context(prompt, allow_tools=True) + metrics_context, _metrics_fallback = metrics_query_context(prompt, allow_tools=True) if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that." + fallback = "I don't have enough data to answer that." 
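+        # The internal HTTP endpoint now mirrors the Matrix flow: grounded
+        # context plus optional metrics panel text, with one generic fallback
+        # if the model backend is unreachable.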
answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) self._write_json(200, {"answer": answer}) @@ -1665,19 +1708,19 @@ def build_context( ) -> str: parts: list[str] = [] - kb = kb_retrieve(prompt) - if not kb and _knowledge_intent(prompt): - kb = kb_retrieve_titles(prompt, limit=4) - if kb: - parts.append(kb) + facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if facts: + parts.append(facts) endpoints, edges = catalog_hints(prompt) if endpoints: parts.append(endpoints) - facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) - if facts: - parts.append(facts) + kb = kb_retrieve(prompt) + if not kb and _knowledge_intent(prompt): + kb = kb_retrieve_titles(prompt, limit=4) + if kb: + parts.append(kb) if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. @@ -1789,12 +1832,14 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "Never include or request secret values. " "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " + "Translate metrics into natural language instead of echoing raw label/value pairs. " + "Do not answer by only listing runbooks; summarize the cluster first and mention docs only if useful. " "If the answer is not grounded in the provided context or tool data, say you do not know. " "End every response with a line: 'Confidence: high|medium|low'." ) transcript_parts = [system] if context: - transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS]) + transcript_parts.append("Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]) transcript_parts.extend(history[hist_key][-24:]) transcript_parts.append(f"User: {prompt}") transcript = "\n".join(transcript_parts) @@ -1950,11 +1995,11 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics) + metrics_context, _metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics) if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that." + fallback = "I don't have enough data to answer that." 
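+            # metrics_query_context() no longer supplies panel-specific
+            # fallback text, so one generic string covers model failure.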
reply = ollama_reply_with_thinking( token, From 32125d7bab3c25fb7d90278580cf1d940ea8f741 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 11:05:30 -0300 Subject: [PATCH 297/416] comms: bump atlasbot configmap checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7cb2d7d..93b5108 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-32 + checksum/atlasbot-configmap: manual-atlasbot-33 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 32851ca0579c835aa9c4dcad4f39e78b5acb6f9b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:20:50 -0300 Subject: [PATCH 298/416] comms: point atlasbot to ollama and raise gateway memory --- services/bstein-dev-home/chat-ai-gateway-deployment.yaml | 4 ++-- services/comms/atlasbot-deployment.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 7209da6..e572406 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -67,10 +67,10 @@ spec: resources: requests: cpu: 20m - memory: 64Mi + memory: 128Mi limits: cpu: 200m - memory: 256Mi + memory: 512Mi volumeMounts: - name: code mountPath: /app/gateway.py diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 93b5108..d41f97c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -80,7 +80,7 @@ spec: - name: BOT_MENTIONS value: atlasbot,aatlasbot - name: OLLAMA_URL - value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ + value: http://ollama.ai.svc.cluster.local:11434/ - name: OLLAMA_MODEL value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC From d8ae9c5901df2c45f88eec504d27482d5ab8157c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:23:05 -0300 Subject: [PATCH 299/416] comms: restore atlasbot gateway URL --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d41f97c..93b5108 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -80,7 +80,7 @@ spec: - name: BOT_MENTIONS value: atlasbot,aatlasbot - name: OLLAMA_URL - value: http://ollama.ai.svc.cluster.local:11434/ + value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC From 3b1e74d2784a62eb2190d1f9ba41bbd61ce5a8d3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:33:56 -0300 Subject: [PATCH 300/416] atlasbot: call ollama chat directly --- services/comms/atlasbot-deployment.yaml | 4 +- services/comms/scripts/atlasbot/bot.py | 55 +++++++++++++++++++++---- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 93b5108..7ec373f 100644 --- a/services/comms/atlasbot-deployment.yaml 
+++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-33 + checksum/atlasbot-configmap: manual-atlasbot-34 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -80,7 +80,7 @@ spec: - name: BOT_MENTIONS value: atlasbot,aatlasbot - name: OLLAMA_URL - value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ + value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e0056f8..6644afb 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -155,6 +155,37 @@ def _ensure_confidence(text: str) -> str: return "\n".join(lines) +def _ollama_endpoint() -> str: + url = (OLLAMA_URL or "").strip() + if not url: + return "" + if url.endswith("/api/chat"): + return url + return url.rstrip("/") + "/api/chat" + + +def _history_to_messages(lines: list[str]) -> list[dict[str, str]]: + messages: list[dict[str, str]] = [] + for line in lines: + raw = (line or "").strip() + if not raw: + continue + role = "user" + content = raw + lowered = raw.lower() + if lowered.startswith("atlas:"): + role = "assistant" + content = raw.split(":", 1)[1].strip() + elif lowered.startswith("user:"): + role = "user" + content = raw.split(":", 1)[1].strip() + elif ":" in raw: + content = raw.split(":", 1)[1].strip() + if content: + messages.append({"role": role, "content": content}) + return messages + + # Mention detection (Matrix rich mentions + plain @atlas). MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()] MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS] @@ -1837,25 +1868,33 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "If the answer is not grounded in the provided context or tool data, say you do not know. " "End every response with a line: 'Confidence: high|medium|low'." 
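+        # If the model drops this footer, _ensure_confidence() backfills
+        # "Confidence: medium" so the contract still holds downstream.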
) - transcript_parts = [system] + endpoint = _ollama_endpoint() + if not endpoint: + raise RuntimeError("ollama endpoint missing") + system_content = system if context: - transcript_parts.append("Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]) - transcript_parts.extend(history[hist_key][-24:]) - transcript_parts.append(f"User: {prompt}") - transcript = "\n".join(transcript_parts) + system_content += "\n\nContext (grounded):\n" + context[:MAX_CONTEXT_CHARS] - payload = {"model": MODEL, "message": transcript} + messages: list[dict[str, str]] = [{"role": "system", "content": system_content}] + messages.extend(_history_to_messages(history[hist_key][-24:])) + messages.append({"role": "user", "content": prompt}) + + payload = {"model": MODEL, "messages": messages, "stream": False} headers = {"Content-Type": "application/json"} if API_KEY: headers["x-api-key"] = API_KEY - r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) + r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) lock = _OLLAMA_LOCK if OLLAMA_SERIALIZE else None if lock: lock.acquire() try: with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: data = json.loads(resp.read().decode()) - raw_reply = data.get("message") or data.get("response") or data.get("reply") or data + msg = data.get("message") if isinstance(data, dict) else None + if isinstance(msg, dict): + raw_reply = msg.get("content") + else: + raw_reply = data.get("response") or data.get("reply") or data reply = _normalize_reply(raw_reply) or "I'm here to help." history[hist_key].append(f"Atlas: {reply}") return reply From 41b131c347a731a40b2ec6d8aa52543d6b833cdf Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:47:28 -0300 Subject: [PATCH 301/416] atlasbot: preserve response text with confidence --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7ec373f..b3e617d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-34 + checksum/atlasbot-configmap: manual-atlasbot-35 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6644afb..c790f5c 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -149,7 +149,7 @@ def _ensure_confidence(text: str) -> str: match = CONFIDENCE_RE.search(line) if match: level = match.group(1).lower() - lines[idx] = f"Confidence: {level}" + lines[idx] = CONFIDENCE_RE.sub(f"Confidence: {level}", line) return "\n".join(lines) lines.append("Confidence: medium") return "\n".join(lines) From b7f454b7908e9ad41ad5dc57efe884aee2d495ec Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:53:17 -0300 Subject: [PATCH 302/416] atlasbot: enrich snapshot facts and pod metrics --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 56 ++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index b3e617d..fd2f399 100644 
--- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-35 + checksum/atlasbot-configmap: manual-atlasbot-36 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index c790f5c..0330620 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -95,6 +95,8 @@ METRIC_HINT_WORDS = { "pending", "unreachable", "latency", + "pod", + "pods", } CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) @@ -116,6 +118,7 @@ METRIC_HINTS = { "net": ("net", "network", "bandwidth", "throughput"), "io": ("io", "disk", "storage"), "connections": ("connections", "conn", "postgres", "database", "db"), + "pods": ("pods", "pod"), } _OLLAMA_LOCK = threading.Lock() @@ -488,13 +491,15 @@ def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool: return "* 100" in expr or "*100" in expr -def _format_metric_value(value: str, *, percent: bool) -> str: +def _format_metric_value(value: str, *, percent: bool, rate: bool = False) -> str: try: num = float(value) except (TypeError, ValueError): return value if percent: return f"{num:.1f}%" + if rate: + return _humanize_rate(value, unit="rate") if abs(num) >= 1: return f"{num:.2f}".rstrip("0").rstrip(".") return f"{num:.4f}".rstrip("0").rstrip(".") @@ -779,6 +784,11 @@ def facts_context( lines: list[str] = ["Facts (live snapshot):"] if total is not None: lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}") + if isinstance(summary, dict): + by_arch_counts = summary.get("by_arch") + if isinstance(by_arch_counts, dict) and by_arch_counts: + parts = [f"{arch}={count}" for arch, count in sorted(by_arch_counts.items())] + lines.append(f"- nodes_by_arch: {', '.join(parts)}") if not_ready_names: lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}") for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): @@ -799,7 +809,7 @@ def facts_context( lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}") if not_ready_workers: lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") - if expected_workers: + if expected_workers and any(word in normalize_query(prompt) for word in ("missing", "expected", "should", "not ready", "unready")): missing = sorted( set(expected_workers) - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")} @@ -814,7 +824,11 @@ def facts_context( node = entry.get("node") value = entry.get("value") if node and value is not None: - value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram")) + value_fmt = _format_metric_value( + str(value), + percent=key in ("cpu", "ram"), + rate=key in ("net", "io"), + ) lines.append(f"- hottest_{key}: {node} ({value_fmt})") postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} @@ -829,6 +843,11 @@ def facts_context( f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})" ) + for key in ("pods_running", "pods_pending", "pods_failed", "pods_succeeded"): + value = metrics.get(key) + if value is not None: + lines.append(f"- {key}: {value}") + usage_table = _node_usage_table(metrics) if usage_table: lines.append("- node_usage 
(cpu/ram/net/io):") @@ -838,8 +857,16 @@ def facts_context( continue cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else "" ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else "" - net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else "" - io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else "" + net = ( + _format_metric_value(str(entry.get("net")), percent=False, rate=True) + if entry.get("net") is not None + else "" + ) + io_val = ( + _format_metric_value(str(entry.get("io")), percent=False, rate=True) + if entry.get("io") is not None + else "" + ) lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") if nodes_in_query: @@ -1029,7 +1056,7 @@ def snapshot_metric_answer( if top: node, val = top percent = metric in {"cpu", "ram"} - value = _format_metric_value(str(val), percent=percent) + value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"}) scope = "" if include_hw: scope = f" among {' and '.join(sorted(include_hw))}" @@ -1051,6 +1078,23 @@ def snapshot_metric_answer( if parts: return _format_confidence(" ".join(parts), "high") + if metric == "pods": + running = metrics.get("pods_running") + pending = metrics.get("pods_pending") + failed = metrics.get("pods_failed") + succeeded = metrics.get("pods_succeeded") + parts = [] + if running is not None: + parts.append(f"running {running:.0f}") + if pending is not None: + parts.append(f"pending {pending:.0f}") + if failed is not None: + parts.append(f"failed {failed:.0f}") + if succeeded is not None: + parts.append(f"succeeded {succeeded:.0f}") + if parts: + return _format_confidence(f"Pods: {', '.join(parts)}.", "high") + return "" def structured_answer( From 159c9cfe68896f16ec5d73ac7d8ca2a2ede2a601 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:59:11 -0300 Subject: [PATCH 303/416] atlasbot: use structured answers before LLM --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index fd2f399..7fdbf64 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-36 + checksum/atlasbot-configmap: manual-atlasbot-37 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 0330620..ff528ea 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1744,6 +1744,17 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) + metrics_summary = snapshot_context(prompt, snapshot) + structured = structured_answer( + prompt, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + self._write_json(200, {"answer": structured}) + return context = build_context( prompt, allow_tools=False, @@ -2065,6 +2076,19 
@@ def sync_loop(token: str, room_id: str): if not inventory: inventory = _snapshot_inventory(snapshot) workloads = _snapshot_workloads(snapshot) + metrics_summary = snapshot_context(body, snapshot) + structured = structured_answer( + body, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + history[hist_key].append(f"Atlas: {structured}") + history[hist_key] = history[hist_key][-80:] + send_msg(token, rid, structured) + continue context = build_context( body, allow_tools=allow_tools, From 70feb1ef85113424b0595115b093afb58cbfe253 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:02:23 -0300 Subject: [PATCH 304/416] atlasbot: refine role and hardware filters --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7fdbf64..ce53f8c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-37 + checksum/atlasbot-configmap: manual-atlasbot-38 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index ff528ea..a7741cd 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -432,7 +432,10 @@ def _detect_metric(q: str) -> str | None: def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: include: set[str] = set() exclude: set[str] = set() + rpi_specific = "rpi4" in q or "rpi5" in q for hardware, phrases in HARDWARE_HINTS.items(): + if hardware == "rpi" and rpi_specific: + continue for phrase in phrases: if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q: exclude.add(hardware) @@ -440,6 +443,17 @@ def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: include.add(hardware) return include, exclude + +def _detect_role_filters(q: str) -> set[str]: + roles: set[str] = set() + if "control-plane" in q or "control plane" in q: + roles.add("control-plane") + if "master" in q: + roles.add("master") + if "accelerator" in q: + roles.add("accelerator") + return roles + def _detect_entity(q: str) -> str | None: if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q): return "node" @@ -1125,6 +1139,7 @@ def structured_answer( include_hw, exclude_hw = _detect_hardware_filters(q) nodes_in_query = _extract_titan_nodes(q) only_workers = "worker" in q or "workers" in q + role_filters = _detect_role_filters(q) only_ready: bool | None = None if "not ready" in q or "unready" in q or "down" in q or "missing" in q: only_ready = False @@ -1201,6 +1216,12 @@ def structured_answer( only_ready=only_ready if op in ("status", "count") else None, nodes_in_query=nodes_in_query, ) + if role_filters: + filtered = [ + node + for node in filtered + if role_filters.intersection(set(node.get("roles") or [])) + ] names = [node["name"] for node in filtered] if op == "status": From 31fbe48ca38f0af429c5b0a7bed4aaa0688628da Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:13:20 -0300 Subject: [PATCH 305/416] atlasbot: fix metric detection and role counts --- 
services/comms/scripts/atlasbot/bot.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index a7741cd..739019c 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -424,9 +424,14 @@ def _detect_operation(q: str) -> str | None: return None def _detect_metric(q: str) -> str | None: + tokens = set(_tokens(q)) for metric, phrases in METRIC_HINTS.items(): - if _has_any(q, phrases): - return metric + for phrase in phrases: + if " " in phrase: + if phrase in q: + return metric + elif phrase in tokens: + return metric return None def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: @@ -1249,7 +1254,7 @@ def structured_answer( if missing: msg += f" Missing: {', '.join(missing)}." return _format_confidence(msg, "high") - if not (include_hw or exclude_hw or nodes_in_query or only_workers): + if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters): return _format_confidence(f"Atlas has {len(names)} nodes.", "high") return _format_confidence(f"Matching nodes: {len(names)}.", "high") From d74277a8bd9c4cf0856e679352a56793fed9a2f8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:15:13 -0300 Subject: [PATCH 306/416] comms: roll atlasbot after script update --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index ce53f8c..4e79347 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-38 + checksum/atlasbot-configmap: manual-atlasbot-39 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From d2ade61d88e31ff618490825e34cee2fbc334ee3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:17:33 -0300 Subject: [PATCH 307/416] atlasbot: refine ready/pod counts --- services/comms/scripts/atlasbot/bot.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 739019c..f7cfd82 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1102,6 +1102,15 @@ def snapshot_metric_answer( pending = metrics.get("pods_pending") failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") + if "pending" in q and pending is not None: + return _format_confidence(f"Pending pods: {pending:.0f}.", "high") + if "failed" in q and failed is not None: + return _format_confidence(f"Failed pods: {failed:.0f}.", "high") + if "succeeded" in q or "completed" in q: + if succeeded is not None: + return _format_confidence(f"Succeeded pods: {succeeded:.0f}.", "high") + if "running" in q and running is not None: + return _format_confidence(f"Running pods: {running:.0f}.", "high") parts = [] if running is not None: parts.append(f"running {running:.0f}") @@ -1254,6 +1263,10 @@ def structured_answer( if missing: msg += f" Missing: {', '.join(missing)}." 
return _format_confidence(msg, "high") + if only_ready is True: + return _format_confidence(f"Ready nodes: {len(names)}.", "high") + if only_ready is False: + return _format_confidence(f"Not ready nodes: {len(names)}.", "high") if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters): return _format_confidence(f"Atlas has {len(names)} nodes.", "high") return _format_confidence(f"Matching nodes: {len(names)}.", "high") From c3b2c0cebb31bae23b82f70ba763eb00ed246903 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:18:01 -0300 Subject: [PATCH 308/416] comms: roll atlasbot after answer tweaks --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4e79347..9af766d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-39 + checksum/atlasbot-configmap: manual-atlasbot-40 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 975783a6b94751752bf4b91bb64306de602d877b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 14:09:23 -0300 Subject: [PATCH 309/416] portal: allow longer atlasbot responses --- services/bstein-dev-home/backend-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 26c99e1..ba7d6f8 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -70,7 +70,7 @@ spec: - name: AI_ATLASBOT_ENDPOINT value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer - name: AI_ATLASBOT_TIMEOUT_SEC - value: "5" + value: "30" - name: AI_NODE_NAME valueFrom: fieldRef: From a00bab5ee799893e966edc563192ce2f46d72b79 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 17:12:03 +0000 Subject: [PATCH 310/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a520991..563b920 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From d03c846779d4c4f217ecb789948e28ba5e6c753a Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 17:12:07 +0000 Subject: [PATCH 311/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 563b920..66d41e3 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ 
b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From cd45b7faba60cf8fb045e78308ebd70fc5f7866a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 14:38:05 -0300 Subject: [PATCH 312/416] atlasbot: ignore mentions and gate cluster context --- services/comms/scripts/atlasbot/bot.py | 193 +++++++++++++++++++------ 1 file changed, 146 insertions(+), 47 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f7cfd82..26fe7ef 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -121,6 +121,49 @@ METRIC_HINTS = { "pods": ("pods", "pod"), } +CLUSTER_HINT_WORDS = { + "atlas", + "titan", + "cluster", + "k8s", + "kubernetes", + "node", + "nodes", + "pod", + "pods", + "namespace", + "service", + "deployment", + "daemonset", + "statefulset", + "grafana", + "victoria", + "prometheus", + "ariadne", + "mailu", + "nextcloud", + "vaultwarden", + "firefly", + "wger", + "jellyfin", + "planka", + "budget", + "element", + "synapse", + "mas", + "comms", + "longhorn", + "harbor", + "jenkins", + "gitea", + "flux", + "keycloak", + "postgres", + "database", + "db", + "atlasbot", +} + _OLLAMA_LOCK = threading.Lock() HARDWARE_HINTS = { @@ -231,6 +274,18 @@ def is_mentioned(content: dict, body: str) -> bool: return False return any(isinstance(uid, str) and uid.lower() in MENTION_USER_IDS for uid in user_ids) +def _strip_bot_mention(text: str) -> str: + if not text: + return "" + if not MENTION_LOCALPARTS: + return text.strip() + names = [re.escape(name) for name in MENTION_LOCALPARTS if name] + if not names: + return text.strip() + pattern = r"^(?:\s*@?(?:" + "|".join(names) + r")(?::)?\s+)+" + cleaned = re.sub(pattern, "", text, flags=re.IGNORECASE).strip() + return cleaned or text.strip() + # Matrix HTTP helper. 
def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): @@ -1780,33 +1835,38 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): if not prompt: self._write_json(400, {"error": "missing_prompt"}) return + cleaned = _strip_bot_mention(prompt) snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) - metrics_summary = snapshot_context(prompt, snapshot) - structured = structured_answer( - prompt, - inventory=inventory, - metrics_summary=metrics_summary, - snapshot=snapshot, - workloads=workloads, - ) - if structured: - self._write_json(200, {"answer": structured}) - return - context = build_context( - prompt, - allow_tools=False, - targets=[], - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - ) - metrics_context, _metrics_fallback = metrics_query_context(prompt, allow_tools=True) - if metrics_context: - context = (context + "\n\n" + metrics_context).strip() if context else metrics_context + cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + metrics_summary = snapshot_context(cleaned, snapshot) if cluster_query else "" + if cluster_query: + structured = structured_answer( + cleaned, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + self._write_json(200, {"answer": structured}) + return + context = "" + if cluster_query: + context = build_context( + cleaned, + allow_tools=False, + targets=[], + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) + metrics_context, _metrics_fallback = metrics_query_context(cleaned, allow_tools=True) + if metrics_context: + context = (context + "\n\n" + metrics_context).strip() if context else metrics_context fallback = "I don't have enough data to answer that." - answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) + answer = ollama_reply(("http", "internal"), cleaned, context=context, fallback=fallback) self._write_json(200, {"answer": answer}) @@ -1920,6 +1980,37 @@ def _knowledge_intent(prompt: str) -> bool: ) +def _is_cluster_query( + prompt: str, + *, + inventory: list[dict[str, Any]] | None, + workloads: list[dict[str, Any]] | None, +) -> bool: + q = normalize_query(prompt) + if not q: + return False + if TITAN_NODE_RE.search(q): + return True + if any(word in q for word in CLUSTER_HINT_WORDS): + return True + for host_match in HOST_RE.finditer(q): + host = host_match.group(1).lower() + if host.endswith("bstein.dev"): + return True + tokens = set(_tokens(q)) + if workloads: + for entry in workloads: + if not isinstance(entry, dict): + continue + if tokens & _workload_tokens(entry): + return True + if inventory: + names = {node.get("name") for node in inventory if isinstance(node, dict)} + if tokens & {n for n in names if n}: + return True + return False + + def _inventory_summary(inventory: list[dict[str, Any]]) -> str: if not inventory: return "" @@ -1958,7 +2049,8 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " "Translate metrics into natural language instead of echoing raw label/value pairs. " - "Do not answer by only listing runbooks; summarize the cluster first and mention docs only if useful. 
" + "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. " + "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. " "If the answer is not grounded in the provided context or tool data, say you do not know. " "End every response with a line: 'Confidence: high|medium|low'." ) @@ -2087,7 +2179,8 @@ def sync_loop(token: str, room_id: str): if not (is_dm or mentioned): continue - lower_body = body.lower() + cleaned_body = _strip_bot_mention(body) + lower_body = cleaned_body.lower() # Only do live cluster introspection in DMs; metrics can be answered when mentioned. allow_tools = is_dm @@ -2101,7 +2194,7 @@ def sync_loop(token: str, room_id: str): # Attempt to scope tools to the most likely workloads when hostnames are mentioned. targets: list[tuple[str, str]] = [] - for m in HOST_RE.finditer(body.lower()): + for m in HOST_RE.finditer(lower_body): host = m.group(1).lower() for ep in _HOST_INDEX.get(host, []): backend = ep.get("backend") or {} @@ -2111,39 +2204,45 @@ def sync_loop(token: str, room_id: str): targets.append((ns, str(w["name"]))) snapshot = _snapshot_state() - inventory = node_inventory_for_prompt(body) + inventory = node_inventory_for_prompt(cleaned_body) if not inventory: inventory = _snapshot_inventory(snapshot) workloads = _snapshot_workloads(snapshot) - metrics_summary = snapshot_context(body, snapshot) - structured = structured_answer( - body, - inventory=inventory, - metrics_summary=metrics_summary, - snapshot=snapshot, - workloads=workloads, - ) + cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + metrics_summary = snapshot_context(cleaned_body, snapshot) if cluster_query else "" + structured = "" + if cluster_query: + structured = structured_answer( + cleaned_body, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) if structured: history[hist_key].append(f"Atlas: {structured}") history[hist_key] = history[hist_key][-80:] send_msg(token, rid, structured) continue - context = build_context( - body, - allow_tools=allow_tools, - targets=targets, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - ) + context = "" + if cluster_query: + context = build_context( + cleaned_body, + allow_tools=allow_tools, + targets=targets, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) if allow_tools and promql: res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - metrics_context, _metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics) - if metrics_context: - context = (context + "\n\n" + metrics_context).strip() if context else metrics_context + if cluster_query: + metrics_context, _metrics_fallback = metrics_query_context(cleaned_body, allow_tools=allow_metrics) + if metrics_context: + context = (context + "\n\n" + metrics_context).strip() if context else metrics_context fallback = "I don't have enough data to answer that." 
@@ -2151,7 +2250,7 @@ def sync_loop(token: str, room_id: str): token, rid, hist_key, - body, + cleaned_body, context=context, fallback=fallback, ) From cb7141dfb63e74219a1852df458213daf3c3ec6e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 14:38:15 -0300 Subject: [PATCH 313/416] comms: roll atlasbot for mention stripping --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 9af766d..aa91fdf 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-40 + checksum/atlasbot-configmap: manual-atlasbot-41 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 92f4137e9c161bf5725469f595f771dd0eba78a3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 14:54:09 -0300 Subject: [PATCH 314/416] atlasbot: simplify cluster gating and context --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 197 ++++++++++++++++-------- 2 files changed, 133 insertions(+), 66 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index aa91fdf..a2b0a3c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-41 + checksum/atlasbot-configmap: manual-atlasbot-42 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 26fe7ef..64097da 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -65,6 +65,16 @@ STOPWORDS = { "help", "atlas", "othrys", + "system", + "systems", + "service", + "services", + "app", + "apps", + "platform", + "software", + "tool", + "tools", } METRIC_HINT_WORDS = { @@ -129,6 +139,8 @@ CLUSTER_HINT_WORDS = { "kubernetes", "node", "nodes", + "worker", + "workers", "pod", "pods", "namespace", @@ -162,6 +174,11 @@ CLUSTER_HINT_WORDS = { "database", "db", "atlasbot", + "jetson", + "rpi", + "raspberry", + "amd64", + "arm64", } _OLLAMA_LOCK = threading.Lock() @@ -1840,18 +1857,6 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) - metrics_summary = snapshot_context(cleaned, snapshot) if cluster_query else "" - if cluster_query: - structured = structured_answer( - cleaned, - inventory=inventory, - metrics_summary=metrics_summary, - snapshot=snapshot, - workloads=workloads, - ) - if structured: - self._write_json(200, {"answer": structured}) - return context = "" if cluster_query: context = build_context( @@ -1862,11 +1867,14 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot=snapshot, workloads=workloads, ) - metrics_context, _metrics_fallback = metrics_query_context(cleaned, allow_tools=True) - if metrics_context: - context = (context + "\n\n" + metrics_context).strip() if context else 
metrics_context fallback = "I don't have enough data to answer that." - answer = ollama_reply(("http", "internal"), cleaned, context=context, fallback=fallback) + answer = ollama_reply( + ("http", "internal"), + cleaned, + context=context, + fallback=fallback, + use_history=False, + ) self._write_json(200, {"answer": answer}) @@ -1897,6 +1905,15 @@ def build_context( if facts: parts.append(facts) + snapshot_json = snapshot_compact_context( + prompt, + snapshot, + inventory=inventory, + workloads=workloads, + ) + if snapshot_json: + parts.append(snapshot_json) + endpoints, edges = catalog_hints(prompt) if endpoints: parts.append(endpoints) @@ -1925,15 +1942,6 @@ def build_context( if flux_bad: parts.append("Flux (not ready):\n" + flux_bad) - p_l = (prompt or "").lower() - if any(w in p_l for w in METRIC_HINT_WORDS): - restarts = vm_top_restarts(1) - if restarts: - parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts) - snap = vm_cluster_snapshot() - if snap: - parts.append("VictoriaMetrics (cluster snapshot):\n" + snap) - return "\n\n".join([p for p in parts if p]).strip() @@ -1963,6 +1971,68 @@ def snapshot_context(prompt: str, snapshot: dict[str, Any] | None) -> str: parts.append(f"Snapshot: workload={match}.") return "\n".join(parts).strip() +def _compact_nodes_detail(snapshot: dict[str, Any]) -> list[dict[str, Any]]: + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + output: list[dict[str, Any]] = [] + for node in details: + if not isinstance(node, dict): + continue + name = node.get("name") + if not name: + continue + output.append( + { + "name": name, + "ready": node.get("ready"), + "hardware": node.get("hardware"), + "arch": node.get("arch"), + "roles": node.get("roles"), + "is_worker": node.get("is_worker"), + "os": node.get("os"), + "kernel": node.get("kernel"), + "kubelet": node.get("kubelet"), + "container_runtime": node.get("container_runtime"), + } + ) + return output + +def _compact_metrics(snapshot: dict[str, Any]) -> dict[str, Any]: + metrics = snapshot.get("metrics") if isinstance(snapshot.get("metrics"), dict) else {} + return { + "pods_running": metrics.get("pods_running"), + "pods_pending": metrics.get("pods_pending"), + "pods_failed": metrics.get("pods_failed"), + "pods_succeeded": metrics.get("pods_succeeded"), + "postgres_connections": metrics.get("postgres_connections"), + "hottest_nodes": metrics.get("hottest_nodes"), + "node_usage": metrics.get("node_usage"), + "top_restarts_1h": metrics.get("top_restarts_1h"), + } + +def snapshot_compact_context( + prompt: str, + snapshot: dict[str, Any] | None, + *, + inventory: list[dict[str, Any]] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + if not snapshot: + return "" + compact = { + "collected_at": snapshot.get("collected_at"), + "nodes_summary": snapshot.get("nodes_summary"), + "expected_workers": expected_worker_nodes_from_metrics(), + "nodes_detail": _compact_nodes_detail(snapshot), + "workloads": _workloads_for_prompt(prompt, workloads or [], limit=40) if workloads else [], + "metrics": _compact_metrics(snapshot), + "flux": snapshot.get("flux"), + "errors": snapshot.get("errors"), + } + text = json.dumps(compact, ensure_ascii=False) + if len(text) > MAX_FACTS_CHARS: + text = text[: MAX_FACTS_CHARS - 3].rstrip() + "..." 
+ return "Cluster snapshot (JSON):\n" + text + def _knowledge_intent(prompt: str) -> bool: q = normalize_query(prompt) @@ -1998,16 +2068,8 @@ def _is_cluster_query( if host.endswith("bstein.dev"): return True tokens = set(_tokens(q)) - if workloads: - for entry in workloads: - if not isinstance(entry, dict): - continue - if tokens & _workload_tokens(entry): - return True - if inventory: - names = {node.get("name") for node in inventory if isinstance(node, dict)} - if tokens & {n for n in names if n}: - return True + if _NAME_INDEX and tokens & _NAME_INDEX: + return True return False @@ -2037,7 +2099,7 @@ def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: summary = "\n".join(parts).strip() return _format_confidence(summary, "medium") if summary else "" -def _ollama_call(hist_key, prompt: str, *, context: str) -> str: +def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = True) -> str: system = ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " @@ -2062,7 +2124,8 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: system_content += "\n\nContext (grounded):\n" + context[:MAX_CONTEXT_CHARS] messages: list[dict[str, str]] = [{"role": "system", "content": system_content}] - messages.extend(_history_to_messages(history[hist_key][-24:])) + if use_history: + messages.extend(_history_to_messages(history[hist_key][-24:])) messages.append({"role": "user", "content": prompt}) payload = {"model": MODEL, "messages": messages, "stream": False} @@ -2082,31 +2145,55 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: else: raw_reply = data.get("response") or data.get("reply") or data reply = _normalize_reply(raw_reply) or "I'm here to help." - history[hist_key].append(f"Atlas: {reply}") + if use_history: + history[hist_key].append(f"Atlas: {reply}") return reply finally: if lock: lock.release() -def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str: +def ollama_reply( + hist_key, + prompt: str, + *, + context: str, + fallback: str = "", + use_history: bool = True, +) -> str: last_error = None for attempt in range(max(1, OLLAMA_RETRIES + 1)): try: - return _ollama_call(hist_key, prompt, context=context) + return _ollama_call(hist_key, prompt, context=context, use_history=use_history) except Exception as exc: # noqa: BLE001 last_error = exc time.sleep(min(4, 2 ** attempt)) if fallback: - history[hist_key].append(f"Atlas: {fallback}") + if use_history: + history[hist_key].append(f"Atlas: {fallback}") return fallback return "I don't have enough data to answer that." -def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str: +def ollama_reply_with_thinking( + token: str, + room: str, + hist_key, + prompt: str, + *, + context: str, + fallback: str, + use_history: bool = True, +) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() def worker(): - result["reply"] = ollama_reply(hist_key, prompt, context=context, fallback=fallback) + result["reply"] = ollama_reply( + hist_key, + prompt, + context=context, + fallback=fallback, + use_history=use_history, + ) done.set() thread = threading.Thread(target=worker, daemon=True) @@ -2182,9 +2269,8 @@ def sync_loop(token: str, room_id: str): cleaned_body = _strip_bot_mention(body) lower_body = cleaned_body.lower() - # Only do live cluster introspection in DMs; metrics can be answered when mentioned. 
+ # Only do live cluster introspection in DMs. allow_tools = is_dm - allow_metrics = is_dm or mentioned promql = "" if allow_tools: @@ -2209,21 +2295,6 @@ def sync_loop(token: str, room_id: str): inventory = _snapshot_inventory(snapshot) workloads = _snapshot_workloads(snapshot) cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) - metrics_summary = snapshot_context(cleaned_body, snapshot) if cluster_query else "" - structured = "" - if cluster_query: - structured = structured_answer( - cleaned_body, - inventory=inventory, - metrics_summary=metrics_summary, - snapshot=snapshot, - workloads=workloads, - ) - if structured: - history[hist_key].append(f"Atlas: {structured}") - history[hist_key] = history[hist_key][-80:] - send_msg(token, rid, structured) - continue context = "" if cluster_query: context = build_context( @@ -2239,11 +2310,6 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - if cluster_query: - metrics_context, _metrics_fallback = metrics_query_context(cleaned_body, allow_tools=allow_metrics) - if metrics_context: - context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = "I don't have enough data to answer that." reply = ollama_reply_with_thinking( @@ -2253,6 +2319,7 @@ def sync_loop(token: str, room_id: str): cleaned_body, context=context, fallback=fallback, + use_history=cluster_query, ) send_msg(token, rid, reply) From 18a4c583389e67dcd95b92318f8f8b1a55dbf807 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 17:58:07 +0000 Subject: [PATCH 315/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 66d41e3..04d7e82 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 1682ccfb25f63dc5eb11a5e5b62a632fefe76960 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 17:58:11 +0000 Subject: [PATCH 316/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 04d7e82..bb9e5f0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: 
chat-ai-gateway namespace: bstein-dev-home From ca3cfaf1fce81460d2fb2f1308c7b2d975dbc583 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:00:36 -0300 Subject: [PATCH 317/416] atlasbot: tighten cluster intent and snapshot framing --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index a2b0a3c..d24cba2 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-42 + checksum/atlasbot-configmap: manual-atlasbot-43 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 64097da..bee72e9 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2104,6 +2104,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " "Use the provided context and facts as your source of truth. " + "If the context includes a cluster snapshot, treat the question as about the Atlas/Othrys cluster even if the prompt is ambiguous. " "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " "Prefer exact repo paths and Kubernetes resource names when relevant. " From f649a6a9a260ae14a0fbd84e6231ac4eea85a7de Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:04:10 -0300 Subject: [PATCH 318/416] atlasbot: force cluster intent in prompts --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d24cba2..f4e7f7d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-43 + checksum/atlasbot-configmap: manual-atlasbot-44 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index bee72e9..4316fe0 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1868,9 +1868,12 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): workloads=workloads, ) fallback = "I don't have enough data to answer that." + llm_prompt = cleaned + if cluster_query: + llm_prompt = f"Atlas cluster question: {cleaned}" answer = ollama_reply( ("http", "internal"), - cleaned, + llm_prompt, context=context, fallback=fallback, use_history=False, @@ -2313,11 +2316,14 @@ def sync_loop(token: str, room_id: str): context = (context + "\n\n" + extra).strip() if context else extra fallback = "I don't have enough data to answer that." 
+ llm_prompt = cleaned_body + if cluster_query: + llm_prompt = f"Atlas cluster question: {cleaned_body}" reply = ollama_reply_with_thinking( token, rid, hist_key, - cleaned_body, + llm_prompt, context=context, fallback=fallback, use_history=cluster_query, From dea70df20978c717b7fde554a200b98b8d35f427 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:07:28 -0300 Subject: [PATCH 319/416] atlasbot: strengthen cluster disambiguation --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index f4e7f7d..de50c37 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-44 + checksum/atlasbot-configmap: manual-atlasbot-45 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 4316fe0..62304fa 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1870,7 +1870,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): fallback = "I don't have enough data to answer that." llm_prompt = cleaned if cluster_query: - llm_prompt = f"Atlas cluster question: {cleaned}" + llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned}" answer = ollama_reply( ("http", "internal"), llm_prompt, @@ -2108,6 +2108,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru "Be helpful, direct, and concise. " "Use the provided context and facts as your source of truth. " "If the context includes a cluster snapshot, treat the question as about the Atlas/Othrys cluster even if the prompt is ambiguous. " + "When a cluster snapshot is provided, never answer about unrelated meanings of 'Atlas' (maps, mythology, Apache Atlas, etc). " "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " "Prefer exact repo paths and Kubernetes resource names when relevant. 
" @@ -2318,7 +2319,7 @@ def sync_loop(token: str, room_id: str): llm_prompt = cleaned_body if cluster_query: - llm_prompt = f"Atlas cluster question: {cleaned_body}" + llm_prompt = f\"Atlas cluster question (use the cluster snapshot context): {cleaned_body}\" reply = ollama_reply_with_thinking( token, rid, From 864f1cab209d6e51b74c35b27457e0b004bd35e5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:10:03 -0300 Subject: [PATCH 320/416] atlasbot: fix prompt formatting --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index de50c37..d4d6668 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-45 + checksum/atlasbot-configmap: manual-atlasbot-46 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 62304fa..429fa31 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2319,7 +2319,7 @@ def sync_loop(token: str, room_id: str): llm_prompt = cleaned_body if cluster_query: - llm_prompt = f\"Atlas cluster question (use the cluster snapshot context): {cleaned_body}\" + llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned_body}" reply = ollama_reply_with_thinking( token, rid, From 241a8889ee42ebacbb5e6073cda598f8858fd50d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:12:47 -0300 Subject: [PATCH 321/416] atlasbot: send snapshot as explicit context --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d4d6668..47d0992 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-46 + checksum/atlasbot-configmap: manual-atlasbot-47 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 429fa31..351bb40 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2124,11 +2124,9 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru endpoint = _ollama_endpoint() if not endpoint: raise RuntimeError("ollama endpoint missing") - system_content = system + messages: list[dict[str, str]] = [{"role": "system", "content": system}] if context: - system_content += "\n\nContext (grounded):\n" + context[:MAX_CONTEXT_CHARS] - - messages: list[dict[str, str]] = [{"role": "system", "content": system_content}] + messages.append({"role": "user", "content": "Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]}) if use_history: messages.extend(_history_to_messages(history[hist_key][-24:])) messages.append({"role": "user", "content": prompt}) From 
b7792d30f1d0d6a3d59110e11ce6464e6c346b1a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:30:43 -0300 Subject: [PATCH 322/416] atlasbot: answer cluster queries without llm --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 284 ++++++++++++++++++++++-- 2 files changed, 263 insertions(+), 23 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 47d0992..69b30e4 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-47 + checksum/atlasbot-configmap: manual-atlasbot-48 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 351bb40..f0bf008 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -532,7 +532,7 @@ def _detect_role_filters(q: str) -> set[str]: return roles def _detect_entity(q: str) -> str | None: - if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q): + if "node" in q or "nodes" in q or "worker" in q or "hardware" in q or "architecture" in q or TITAN_NODE_RE.search(q): return "node" if "pod" in q or "pods" in q: return "pod" @@ -1152,6 +1152,15 @@ def snapshot_metric_answer( if include_hw: scope = f" among {' and '.join(sorted(include_hw))}" answer = f"Hottest node{scope}: {node} ({value})." + if allowed_nodes and len(allowed_nodes) != len(inventory): + overall = _node_usage_top(usage, allowed_nodes=None) + if overall and overall[0] != node: + overall_val = _format_metric_value( + str(overall[1]), + percent=percent, + rate=metric in {"net", "io"}, + ) + answer += f" Overall hottest: {overall[0]} ({overall_val})." return _format_confidence(answer, "high") if metric == "connections" or "postgres" in q: @@ -1358,6 +1367,219 @@ def structured_answer( return "" + +def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None) -> str: + summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} + nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {} + total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total") + ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready") + not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready") + if total is None: + total = len(inventory) + ready = len([n for n in inventory if n.get("ready") is True]) + not_ready = len([n for n in inventory if n.get("ready") is False]) + if total is None: + return "" + return f"Atlas cluster has {total} nodes ({ready} ready, {not_ready} not ready)." + + +def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str: + if not inventory: + return "" + groups = _group_nodes(inventory) + parts: list[str] = [] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes = groups.get(key) or [] + if nodes: + parts.append(f"{key}={len(nodes)}") + if not parts: + return "" + return "Hardware mix: " + ", ".join(parts) + "." 
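
[editor's note between added helpers] As a quick sanity check of the mix-line shape, here is a minimal sketch with a toy inventory; `_group_nodes` in the real bot buckets nodes by detected hardware, which the sketch approximates with a plain `hardware` field, and the node names are made up for illustration:

```python
from collections import Counter

def hardware_mix_line(inventory: list[dict]) -> str:
    # Approximation of _hardware_mix_line: count nodes per hardware class
    # and render the non-empty buckets in a fixed order.
    counts = Counter(n.get("hardware") or "unknown" for n in inventory)
    order = ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown")
    parts = [f"{hw}={counts[hw]}" for hw in order if counts.get(hw)]
    return "Hardware mix: " + ", ".join(parts) + "." if parts else ""

inventory = [
    {"name": "titan-01", "hardware": "rpi5"},
    {"name": "titan-02", "hardware": "rpi5"},
    {"name": "titan-20", "hardware": "jetson"},
]
print(hardware_mix_line(inventory))  # Hardware mix: rpi5=2, jetson=1.
```
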
+ + +def _os_mix_line(snapshot: dict[str, Any] | None) -> str: + if not snapshot: + return "" + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + counts: dict[str, int] = collections.Counter() + for node in details: + if not isinstance(node, dict): + continue + os_name = (node.get("os") or "").strip() + if os_name: + counts[os_name] += 1 + if not counts: + return "" + parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))] + return "OS mix: " + ", ".join(parts[:5]) + "." + + +def _pods_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + running = metrics.get("pods_running") + pending = metrics.get("pods_pending") + failed = metrics.get("pods_failed") + succeeded = metrics.get("pods_succeeded") + parts: list[str] = [] + if running is not None: + parts.append(f"{running:.0f} running") + if pending is not None: + parts.append(f"{pending:.0f} pending") + if failed is not None: + parts.append(f"{failed:.0f} failed") + if succeeded is not None: + parts.append(f"{succeeded:.0f} succeeded") + if not parts: + return "" + return "Pods: " + ", ".join(parts) + "." + + +def _postgres_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if not postgres: + return "" + used = postgres.get("used") + max_conn = postgres.get("max") + hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + parts: list[str] = [] + if used is not None and max_conn is not None: + parts.append(f"{used:.0f}/{max_conn:.0f} connections") + if hottest.get("label"): + hot_val = hottest.get("value") + hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" + parts.append(f"hottest {hottest.get('label')} ({hot_val_str})") + if not parts: + return "" + return "Postgres: " + ", ".join(parts) + "." + + +def _hottest_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if not hottest: + return "" + parts: list[str] = [] + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if node and value is not None: + value_fmt = _format_metric_value( + str(value), + percent=key in ("cpu", "ram"), + rate=key in ("net", "io"), + ) + parts.append(f"{key.upper()} {node} ({value_fmt})") + if not parts: + return "" + return "Hottest nodes: " + "; ".join(parts) + "." 
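
[editor's note between added helpers] Composed together, these line builders yield the multi-line overview that `cluster_overview_answer` (next in this hunk) returns. A rough, self-contained rendering pass — the metric keys and values below are inferred from the helpers in this patch, not taken from the actual snapshot collector:

```python
# Toy snapshot metrics in the shape the helpers above consume.
metrics = {
    "pods_running": 412.0,
    "pods_pending": 3.0,
    "hottest_nodes": {
        "cpu": {"node": "titan-20", "value": 91.2},
        "ram": {"node": "titan-07", "value": 78.5},
    },
}

lines = []
if metrics.get("pods_running") is not None:
    lines.append(
        f"Pods: {metrics['pods_running']:.0f} running, "
        f"{metrics.get('pods_pending', 0):.0f} pending."
    )
hottest = metrics.get("hottest_nodes") or {}
parts = [
    f"{key.upper()} {entry['node']} ({entry['value']:.1f}%)"
    for key, entry in hottest.items()
]
if parts:
    lines.append("Hottest nodes: " + "; ".join(parts) + ".")
print("Based on the snapshot, " + "\n".join(lines))
```
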
+ + +def cluster_overview_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, +) -> str: + if not inventory and not snapshot: + return "" + q = normalize_query(prompt) + metrics = _snapshot_metrics(snapshot) + lines: list[str] = [] + + nodes_line = _nodes_summary_line(inventory, snapshot) + if nodes_line: + lines.append(nodes_line) + + if any(word in q for word in ("hardware", "architecture", "nodes", "node", "cluster", "atlas", "titan", "lab")): + hw_line = _hardware_mix_line(inventory) + if hw_line: + lines.append(hw_line) + os_line = _os_mix_line(snapshot) + if os_line: + lines.append(os_line) + + if any( + word in q + for word in ( + "interesting", + "status", + "health", + "overview", + "summary", + "tell me", + "what do you know", + "about", + "pods", + "postgres", + "connections", + "hottest", + "cpu", + "ram", + "memory", + "net", + "network", + "io", + "disk", + "busy", + "load", + "usage", + "utilization", + ) + ): + pods_line = _pods_summary_line(metrics) + if pods_line: + lines.append(pods_line) + hottest_line = _hottest_summary_line(metrics) + if hottest_line: + lines.append(hottest_line) + postgres_line = _postgres_summary_line(metrics) + if postgres_line: + lines.append(postgres_line) + + if not lines: + return "" + return "Based on the snapshot, " + "\n".join(lines) + + +def cluster_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + metrics_summary = snapshot_context(prompt, snapshot) + structured = structured_answer( + prompt, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + return structured + + overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot) + if overview: + kb_titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else "" + if kb_titles: + overview = overview + "\n" + kb_titles + return _format_confidence(overview, "medium") + + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + return _format_confidence(kb_titles, "low") + + if metrics_summary: + return _format_confidence(metrics_summary, "low") + + return "" + def _metric_tokens(entry: dict[str, Any]) -> str: parts: list[str] = [] for key in ("panel_title", "dashboard", "description"): @@ -1868,16 +2090,24 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): workloads=workloads, ) fallback = "I don't have enough data to answer that." 
- llm_prompt = cleaned if cluster_query: - llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned}" - answer = ollama_reply( - ("http", "internal"), - llm_prompt, - context=context, - fallback=fallback, - use_history=False, - ) + answer = cluster_answer( + cleaned, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) + if not answer: + answer = fallback + else: + llm_prompt = cleaned + answer = ollama_reply( + ("http", "internal"), + llm_prompt, + context=context, + fallback=fallback, + use_history=False, + ) self._write_json(200, {"answer": answer}) @@ -2044,6 +2274,7 @@ def _knowledge_intent(prompt: str) -> bool: for phrase in ( "what do you know", "tell me about", + "interesting", "overview", "summary", "describe", @@ -2312,21 +2543,30 @@ def sync_loop(token: str, room_id: str): res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered - context = (context + "\n\n" + extra).strip() if context else extra + send_msg(token, rid, extra) + continue fallback = "I don't have enough data to answer that." - llm_prompt = cleaned_body if cluster_query: - llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned_body}" - reply = ollama_reply_with_thinking( - token, - rid, - hist_key, - llm_prompt, - context=context, - fallback=fallback, - use_history=cluster_query, - ) + reply = cluster_answer( + cleaned_body, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) + if not reply: + reply = fallback + else: + llm_prompt = cleaned_body + reply = ollama_reply_with_thinking( + token, + rid, + hist_key, + llm_prompt, + context=context, + fallback=fallback, + use_history=False, + ) send_msg(token, rid, reply) def login_with_retry(): From 631bd09778cb72faa7dda83732693c9c8715ccb9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:36:08 -0300 Subject: [PATCH 323/416] atlasbot: return structured cluster summaries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 69b30e4..0685626 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-48 + checksum/atlasbot-configmap: manual-atlasbot-49 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f0bf008..e936b95 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1268,7 +1268,17 @@ def structured_answer( node_regex = "|".join([n["name"] for n in scoped]) expr = _apply_node_filter(expr, node_regex) res = vm_query(expr, timeout=20) - answer = _format_metric_answer(entry, res) + answer = "" + if op == "top" or "hottest" in (entry.get("panel_title") or "").lower(): + node, val = _primary_series_metric(res) + if node and val is not None: + percent = _metric_expr_uses_percent(entry) + value_fmt = _format_metric_value(val or "", percent=percent) + metric_label = (metric or "").upper() + label = f"{metric_label} node" if metric_label else "node" + answer = 
f"Hottest {label}: {node} ({value_fmt})." + if not answer: + answer = _format_metric_answer(entry, res) if answer: scope_parts: list[str] = [] if include_hw: @@ -1292,8 +1302,8 @@ def structured_answer( percent = _metric_expr_uses_percent(entry) base_val_fmt = _format_metric_value(base_val or "", percent=percent) overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})." - return f"Among {scope} nodes, {answer}{overall_note}" - return answer + return _format_confidence(f"Among {scope} nodes, {answer}{overall_note}", "high") + return _format_confidence(answer, "high") if metrics_summary: return metrics_summary @@ -1408,7 +1418,7 @@ def _os_mix_line(snapshot: dict[str, Any] | None) -> str: os_name = (node.get("os") or "").strip() if os_name: counts[os_name] += 1 - if not counts: + if not counts or (len(counts) == 1 and "linux" in counts): return "" parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))] return "OS mix: " + ", ".join(parts[:5]) + "." From 3f159c6c83ed33bfad49a415fe92904a48144350 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:42:31 -0300 Subject: [PATCH 324/416] atlasbot: improve workload matching and fallbacks --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 0685626..bccf752 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-49 + checksum/atlasbot-configmap: manual-atlasbot-50 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e936b95..34e27cf 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1031,6 +1031,12 @@ def _workload_tokens(entry: dict[str, Any]) -> set[str]: return tokens +def _workload_query_target(prompt: str) -> str: + tokens = set(_tokens(prompt)) + matches = sorted(tokens & _NAME_INDEX) if _NAME_INDEX else [] + return matches[0] if matches else "" + + def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, Any] | None: q_tokens = set(_tokens(prompt)) if not q_tokens: @@ -1041,6 +1047,12 @@ def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, continue tokens = _workload_tokens(entry) score = len(tokens & q_tokens) + name = (entry.get("workload") or "").lower() + namespace = (entry.get("namespace") or "").lower() + if name and name in q_tokens: + score += 5 + if namespace and namespace in q_tokens: + score += 3 if score: scored.append((score, entry)) if not scored: @@ -1574,6 +1586,14 @@ def cluster_answer( if structured: return structured + q = normalize_query(prompt) + workload_target = _workload_query_target(prompt) + if workload_target and any(word in q for word in ("where", "run", "running", "host", "node")): + return _format_confidence( + f"I don't have workload placement data for {workload_target} in the current snapshot.", + "low", + ) + overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot) if overview: kb_titles = kb_retrieve_titles(prompt, limit=4) if 
_knowledge_intent(prompt) else "" From 354275f3ad9fa195d47916b84b73f8ce8d0fef74 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:45:18 -0300 Subject: [PATCH 325/416] atlasbot: avoid namespace-only workload matches --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index bccf752..301a474 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-50 + checksum/atlasbot-configmap: manual-atlasbot-51 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 34e27cf..d36844b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1071,11 +1071,17 @@ def workload_answer(prompt: str, workloads: list[dict[str, Any]]) -> str: q = normalize_query(prompt) if not any(word in q for word in ("where", "which", "node", "run", "running", "host", "located")): return "" + target = _workload_query_target(prompt) entry = _select_workload(prompt, workloads) if not entry: return "" workload = entry.get("workload") or "" namespace = entry.get("namespace") or "" + if target: + workload_l = str(workload).lower() + namespace_l = str(namespace).lower() + if workload_l != target and namespace_l == target and "namespace" not in q and "workload" not in q: + return "" nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {} primary = entry.get("primary_node") or "" if not workload or not nodes: From dedf56699313fb191b73b80e0642cc39f90ae818 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 18:57:30 +0000 Subject: [PATCH 326/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index e4580aa..a1ca583 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-58 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 62a423f32c6cf5c04089ae357dc7ed9da3ceaf4f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:19:30 -0300 Subject: [PATCH 327/416] monitoring: fix jetson gpu metrics --- scripts/dashboards_render_atlas.py | 9 ++++++- services/monitoring/dashboards/atlas-gpu.json | 2 +- .../monitoring/grafana-dashboard-gpu.yaml | 2 +- .../jetson-tegrastats-exporter.yaml | 2 +- .../scripts/jetson_tegrastats_exporter.py | 25 +++++++++++++------ 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 675fec5..6ad4321 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -221,6 +221,13 @@ def jetson_gpu_util_by_node(): return 'max by (node) (jetson_gr3d_freq_percent{node!=""})' +def 
jetson_gpu_util_by_hostname(): + return ( + 'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), ' + '"Hostname", "$1", "node", "(.*)")' + ) + + def jetson_gpu_requests(scope_var): return ( "sum by (namespace,node) (" @@ -2688,7 +2695,7 @@ def build_gpu_dashboard(): timeseries_panel( 3, "GPU Util by Node", - 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})', + f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})', {"h": 8, "w": 12, "x": 0, "y": 8}, unit="percent", legend="{{Hostname}}", diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 6b76a5c..36ab9e5 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -126,7 +126,7 @@ }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 46b25cd..bb395db 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -135,7 +135,7 @@ data: }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 8584eba..0074394 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "1" + monitoring.bstein.dev/restart-rev: "2" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index c237ec5..3858d96 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -4,7 +4,7 @@ import re import socketserver import subprocess import threading -from time import time +from time import sleep, time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename @@ -20,6 +20,7 @@ METRICS = { LOCK = threading.Lock() def parse_line(line: str): + line = line.strip() updates = {} m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line) if m: @@ -34,7 +35,7 @@ def parse_line(line: str): if m: updates["ram_used_mb"] = float(m.group(1)) updates["ram_total_mb"] = float(m.group(2)) - m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line) + m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) with LOCK: @@ -42,15 +43,23 @@ def parse_line(line: str): METRICS["last_scrape_ts"] = time() def run_tegrastats(): - proc = subprocess.Popen( - ["/host/usr/bin/tegrastats", "--interval", "1000"], - stdout=subprocess.PIPE, + logfile = "/tmp/tegrastats.log" + subprocess.Popen( + ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", 
logfile], + stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, text=True, - bufsize=1, ) - for line in proc.stdout: - parse_line(line) + while not os.path.exists(logfile): + sleep(0.1) + with open(logfile, "r", encoding="utf-8", errors="ignore") as handle: + handle.seek(0, os.SEEK_END) + while True: + line = handle.readline() + if not line: + sleep(0.2) + continue + parse_line(line) class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): From 19512910904b75a36e627377a2abdb190aa08ece Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:23:23 -0300 Subject: [PATCH 328/416] monitoring: refresh jetson stats on scrape --- .../jetson-tegrastats-exporter.yaml | 2 +- .../scripts/jetson_tegrastats_exporter.py | 37 +++++++++++-------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 0074394..a6612c6 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "2" + monitoring.bstein.dev/restart-rev: "3" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 3858d96..4cbf6ca 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -4,10 +4,11 @@ import re import socketserver import subprocess import threading -from time import sleep, time +from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename +LOGFILE = "/tmp/tegrastats.log" METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, @@ -42,24 +43,28 @@ def parse_line(line: str): METRICS.update(updates) METRICS["last_scrape_ts"] = time() -def run_tegrastats(): - logfile = "/tmp/tegrastats.log" +def start_tegrastats(): subprocess.Popen( - ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", logfile], + ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", LOGFILE], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, text=True, ) - while not os.path.exists(logfile): - sleep(0.1) - with open(logfile, "r", encoding="utf-8", errors="ignore") as handle: - handle.seek(0, os.SEEK_END) - while True: - line = handle.readline() - if not line: - sleep(0.2) - continue - parse_line(line) + + +def refresh_from_log(): + if not os.path.exists(LOGFILE): + return + try: + with open(LOGFILE, "rb") as handle: + handle.seek(0, os.SEEK_END) + size = handle.tell() + handle.seek(max(size - 4096, 0), os.SEEK_SET) + tail = handle.read().decode("utf-8", errors="ignore").splitlines() + if tail: + parse_line(tail[-1]) + except OSError: + return class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): @@ -67,6 +72,7 @@ class Handler(http.server.BaseHTTPRequestHandler): self.send_response(404) self.end_headers() return + refresh_from_log() with LOCK: metrics = METRICS.copy() out = [] @@ -85,7 +91,6 @@ class Handler(http.server.BaseHTTPRequestHandler): return if __name__ == "__main__": - t = threading.Thread(target=run_tegrastats, daemon=True) - t.start() + start_tegrastats() with socketserver.TCPServer(("", PORT), Handler) as httpd: httpd.serve_forever() From 246ed6617e8b44caf1adee7423b790fa93d47861 Mon Sep 17 
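As an aside on the GPU Util panel expression above: DCGM labels its series with Hostname while the Jetson exporter labels them with node, so PATCH 327 rewrites the Jetson side before merging the two. A condensed sketch of that pattern, with metric names taken from the patch:

    def gpu_util_by_hostname_expr() -> str:
        dcgm = 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})'
        jetson = (
            'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), '
            '"Hostname", "$1", "node", "(.*)")'
        )
        # `or` keeps every series from the DCGM half and only adds Jetson
        # series whose label sets are not already present, so both exporter
        # families share one legend under the Hostname label.
        return f"({dcgm}) or ({jetson})"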
00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:27:45 -0300 Subject: [PATCH 329/416] monitoring: read jetson stats on demand --- .../jetson-tegrastats-exporter.yaml | 2 +- .../scripts/jetson_tegrastats_exporter.py | 27 +++++++++---------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index a6612c6..d80d83e 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "3" + monitoring.bstein.dev/restart-rev: "4" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 4cbf6ca..204e439 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -3,13 +3,12 @@ import os import re import socketserver import subprocess -import threading from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename LOGFILE = "/tmp/tegrastats.log" -METRICS = { +BASE_METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, "cpu_temp_c": 0.0, @@ -18,9 +17,8 @@ METRICS = { "power_5v_in_mw": 0.0, "last_scrape_ts": 0.0, } -LOCK = threading.Lock() -def parse_line(line: str): +def parse_line(line: str) -> dict: line = line.strip() updates = {} m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line) @@ -39,9 +37,7 @@ def parse_line(line: str): m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) - with LOCK: - METRICS.update(updates) - METRICS["last_scrape_ts"] = time() + return updates def start_tegrastats(): subprocess.Popen( @@ -52,19 +48,18 @@ def start_tegrastats(): ) -def refresh_from_log(): +def read_latest_line() -> str: if not os.path.exists(LOGFILE): - return + return "" try: with open(LOGFILE, "rb") as handle: handle.seek(0, os.SEEK_END) size = handle.tell() handle.seek(max(size - 4096, 0), os.SEEK_SET) tail = handle.read().decode("utf-8", errors="ignore").splitlines() - if tail: - parse_line(tail[-1]) + return tail[-1] if tail else "" except OSError: - return + return "" class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): @@ -72,9 +67,11 @@ class Handler(http.server.BaseHTTPRequestHandler): self.send_response(404) self.end_headers() return - refresh_from_log() - with LOCK: - metrics = METRICS.copy() + metrics = BASE_METRICS.copy() + line = read_latest_line() + if line: + metrics.update(parse_line(line)) + metrics["last_scrape_ts"] = time() out = [] label = f'{{node="{NODE_NAME}"}}' for k, v in metrics.items(): From a7f3d49fea6c867c8c73e078669ed51e27181d45 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:34:31 -0300 Subject: [PATCH 330/416] monitoring: read tegrastats per scrape --- .../jetson-tegrastats-exporter.yaml | 2 +- .../scripts/jetson_tegrastats_exporter.py | 32 ++++++++----------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index d80d83e..3679938 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: 
annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "4" + monitoring.bstein.dev/restart-rev: "5" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 204e439..8314ad7 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -7,7 +7,6 @@ from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename -LOGFILE = "/tmp/tegrastats.log" BASE_METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, @@ -39,25 +38,21 @@ def parse_line(line: str) -> dict: updates["power_5v_in_mw"] = float(m.group(1)) return updates -def start_tegrastats(): - subprocess.Popen( - ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", LOGFILE], - stdout=subprocess.DEVNULL, - stderr=subprocess.STDOUT, - text=True, - ) - - def read_latest_line() -> str: - if not os.path.exists(LOGFILE): - return "" try: - with open(LOGFILE, "rb") as handle: - handle.seek(0, os.SEEK_END) - size = handle.tell() - handle.seek(max(size - 4096, 0), os.SEEK_SET) - tail = handle.read().decode("utf-8", errors="ignore").splitlines() - return tail[-1] if tail else "" + proc = subprocess.Popen( + ["/host/usr/bin/tegrastats", "--interval", "1000"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + line = proc.stdout.readline() + proc.terminate() + try: + proc.wait(timeout=1) + except subprocess.TimeoutExpired: + proc.kill() + return line except OSError: return "" @@ -88,6 +83,5 @@ class Handler(http.server.BaseHTTPRequestHandler): return if __name__ == "__main__": - start_tegrastats() with socketserver.TCPServer(("", PORT), Handler) as httpd: httpd.serve_forever() From dfb295e5f0ccef0d6052656770a8a5a2da087eb0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:38:09 -0300 Subject: [PATCH 331/416] monitoring: expose jetson scrape line length --- services/monitoring/jetson-tegrastats-exporter.yaml | 2 +- services/monitoring/scripts/jetson_tegrastats_exporter.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 3679938..6b0ce37 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "5" + monitoring.bstein.dev/restart-rev: "6" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 8314ad7..284d5ce 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -14,6 +14,7 @@ BASE_METRICS = { "ram_used_mb": 0.0, "ram_total_mb": 0.0, "power_5v_in_mw": 0.0, + "log_line_len": 0.0, "last_scrape_ts": 0.0, } @@ -33,7 +34,7 @@ def parse_line(line: str) -> dict: if m: updates["ram_used_mb"] = float(m.group(1)) updates["ram_total_mb"] = float(m.group(2)) - m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line) + m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)(?:mW)?/(\\d+)(?:mW)?", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) 
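A note on the doubled backslashes visible in the patterns above, which PATCH 332 below corrects: inside a plain .py file, a raw string such as r"GR3D_FREQ\\s+(\\d+)%" contains a literal backslash pair, which the regex engine reads as an escaped backslash followed by the letter s, a sequence tegrastats never emits, so these patterns silently match nothing. A small self-check (the sample line is illustrative, not captured output):

    import re

    line = "RAM 2403/7772MB GR3D_FREQ 42% GPU@41.5C"
    # Doubled escapes match a literal backslash, so this never fires:
    assert re.search(r"GR3D_FREQ\\s+(\\d+)%", line) is None
    # Single escapes match whitespace and digits as intended:
    assert re.search(r"GR3D_FREQ\s+(\d+)%", line).group(1) == "42"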
return updates @@ -66,6 +67,7 @@ class Handler(http.server.BaseHTTPRequestHandler): line = read_latest_line() if line: metrics.update(parse_line(line)) + metrics["log_line_len"] = float(len(line)) metrics["last_scrape_ts"] = time() out = [] label = f'{{node="{NODE_NAME}"}}' From 5f32dff73b372159d49b037a9fed79c5e76a0f87 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:44:00 -0300 Subject: [PATCH 332/416] monitoring: fix tegrastats regexes --- services/monitoring/jetson-tegrastats-exporter.yaml | 2 +- .../monitoring/scripts/jetson_tegrastats_exporter.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 6b0ce37..ba25c9f 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "6" + monitoring.bstein.dev/restart-rev: "7" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 284d5ce..8b36111 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -21,20 +21,20 @@ BASE_METRICS = { def parse_line(line: str) -> dict: line = line.strip() updates = {} - m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line) + m = re.search(r"GR3D_FREQ\s+(\d+)%", line) if m: updates["gr3d_freq_percent"] = float(m.group(1)) - m = re.search(r"GPU@(\\d+(?:\\.\\d+)?)C", line) + m = re.search(r"GPU@(\d+(?:\.\d+)?)C", line) if m: updates["gpu_temp_c"] = float(m.group(1)) - m = re.search(r"CPU@(\\d+(?:\\.\\d+)?)C", line) + m = re.search(r"CPU@(\d+(?:\.\d+)?)C", line) if m: updates["cpu_temp_c"] = float(m.group(1)) - m = re.search(r"RAM\\s+(\\d+)/(\\d+)MB", line) + m = re.search(r"RAM\s+(\d+)/(\d+)MB", line) if m: updates["ram_used_mb"] = float(m.group(1)) updates["ram_total_mb"] = float(m.group(2)) - m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)(?:mW)?/(\\d+)(?:mW)?", line) + m = re.search(r"(?:POM_5V_IN|VDD_IN)\s+(\d+)(?:mW)?/(\d+)(?:mW)?", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) return updates From 1b04e6cb0095b8002db0d4b844377b5245bb8e6c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 17:51:13 -0300 Subject: [PATCH 333/416] monitoring: fix gpu idle share --- scripts/dashboards_render_atlas.py | 2 +- services/monitoring/dashboards/atlas-gpu.json | 2 +- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/grafana-dashboard-gpu.yaml | 2 +- services/monitoring/grafana-dashboard-overview.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 6ad4321..34ded89 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -266,7 +266,7 @@ def namespace_gpu_share_expr(scope_var): usage = namespace_gpu_usage_instant(scope_var) total = f"(sum({usage}) or on() vector(0))" share = f"100 * ({usage}) / clamp_min({total}, 1)" - idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)" + idle = 'label_replace(vector(100), "namespace", "idle", "", "") * on() (' + total + " == bool 0)" return f"({share}) or ({idle})" diff --git 
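Zooming in on the render-script change at the heart of PATCH 333: the idle slice used to be gated with a vector `and` and is now derived arithmetically. A minimal restatement, assuming a usage expression shaped like the dashboard's per-namespace GPU usage:

    def idle_share_exprs(usage: str) -> tuple[str, str]:
        total = f"(sum({usage}) or on() vector(0))"
        # Old form: `== 0` without the bool modifier acts as a filter, so the
        # right-hand operand is either the total sample or an empty vector,
        # and `and on()` only keeps the idle series when that sample exists.
        old = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
        # New form: `== bool 0` always returns 0 or 1, so the product is a
        # defined series in every case: 100 when usage is zero, 0 otherwise.
        new = 'label_replace(vector(100), "namespace", "idle", "", "") * on() (' + total + " == bool 0)"
        return old, new

A side effect of the arithmetic form is that the idle legend entry stays present (at 0) while GPUs are busy, rather than appearing and disappearing with vector matching.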
a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 36ab9e5..f6801aa 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) 
group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 04352f9..1a507ec 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or 
label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index bb395db..dc1025b 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) 
kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 9495647..ed63da0 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * 
on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) 
kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } From e87fa4369ced5b155e5fbfa940cf8f0fa22497f5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:08:19 -0300 Subject: [PATCH 334/416] atlasbot: make cluster answers more narrative --- services/comms/scripts/atlasbot/bot.py | 196 +++++++++++++++++++++---- 1 file changed, 165 insertions(+), 31 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index d36844b..0dcfc60 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -181,6 +181,27 @@ CLUSTER_HINT_WORDS = { "arm64", } +_INSIGHT_HINT_WORDS = { + "interesting", + "unconventional", + "surprising", + "weird", + "odd", + "fun", + "cool", + "unique", + "notable", +} + +_OVERVIEW_HINT_WORDS = { + "overview", + "summary", + "describe", + "explain", + "tell me about", + "what do you know", +} + _OLLAMA_LOCK = threading.Lock() HARDWARE_HINTS = { @@ -1408,7 +1429,18 @@ def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any not_ready = len([n for n in inventory if n.get("ready") is False]) if total is None: return "" - return f"Atlas cluster has {total} nodes ({ready} ready, {not_ready} not ready)." + if not_ready: + names = [] + summary_names = summary.get("not_ready_names") if isinstance(summary, dict) else [] + if isinstance(summary_names, list): + names = [name for name in summary_names if isinstance(name, str)] + if not names and snapshot: + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + names = [node.get("name") for node in details if isinstance(node, dict) and node.get("ready") is False] + names = [name for name in names if isinstance(name, str) and name] + suffix = f" (not ready: {', '.join(names)})" if names else "" + return f"Atlas has {total} nodes; {ready} ready, {not_ready} not ready{suffix}." + return f"Atlas has {total} nodes and all are Ready." 
def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str: @@ -1422,7 +1454,7 @@ def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str: parts.append(f"{key}={len(nodes)}") if not parts: return "" - return "Hardware mix: " + ", ".join(parts) + "." + return "Hardware mix includes " + ", ".join(parts) + "." def _os_mix_line(snapshot: dict[str, Any] | None) -> str: @@ -1449,6 +1481,8 @@ def _pods_summary_line(metrics: dict[str, Any]) -> str: pending = metrics.get("pods_pending") failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") + if running is None and pending is None and failed is None and succeeded is None: + return "" parts: list[str] = [] if running is not None: parts.append(f"{running:.0f} running") @@ -1458,9 +1492,7 @@ def _pods_summary_line(metrics: dict[str, Any]) -> str: parts.append(f"{failed:.0f} failed") if succeeded is not None: parts.append(f"{succeeded:.0f} succeeded") - if not parts: - return "" - return "Pods: " + ", ".join(parts) + "." + return "There are " + ", ".join(parts) + " pods." def _postgres_summary_line(metrics: dict[str, Any]) -> str: @@ -1481,7 +1513,7 @@ def _postgres_summary_line(metrics: dict[str, Any]) -> str: parts.append(f"hottest {hottest.get('label')} ({hot_val_str})") if not parts: return "" - return "Postgres: " + ", ".join(parts) + "." + return "Postgres is at " + ", ".join(parts) + "." def _hottest_summary_line(metrics: dict[str, Any]) -> str: @@ -1504,7 +1536,101 @@ def _hottest_summary_line(metrics: dict[str, Any]) -> str: parts.append(f"{key.upper()} {node} ({value_fmt})") if not parts: return "" - return "Hottest nodes: " + "; ".join(parts) + "." + return "Hot spots: " + "; ".join(parts) + "." + + +def _is_insight_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + if any(word in q for word in _INSIGHT_HINT_WORDS): + return True + if "most" in q and any(word in q for word in ("unusual", "odd", "weird", "unconventional")): + return True + return False + + +def _is_overview_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any(word in q for word in _OVERVIEW_HINT_WORDS) + + +def _doc_intent(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any( + phrase in q + for phrase in ( + "runbook", + "documentation", + "docs", + "guide", + "how do i", + "how to", + "instructions", + "playbook", + ) + ) + + +def _insight_candidates( + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, +) -> list[tuple[str, str, str]]: + metrics = _snapshot_metrics(snapshot) + candidates: list[tuple[str, str, str]] = [] + + nodes_line = _nodes_summary_line(inventory, snapshot) + if nodes_line and "not ready" in nodes_line.lower(): + candidates.append(("availability", nodes_line, "high")) + + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if hottest: + cpu = hottest.get("cpu") if isinstance(hottest.get("cpu"), dict) else {} + if cpu.get("node") and cpu.get("value") is not None: + value_fmt = _format_metric_value(str(cpu.get("value")), percent=True) + candidates.append(("cpu", f"The busiest CPU right now is {cpu.get('node')} at about {value_fmt}.", "high")) + ram = hottest.get("ram") if isinstance(hottest.get("ram"), dict) else {} + if ram.get("node") and ram.get("value") is not None: + value_fmt = _format_metric_value(str(ram.get("value")), percent=True) + candidates.append(("ram", f"RAM usage peaks on {ram.get('node')} at about {value_fmt}.", "high")) + + 
postgres_line = _postgres_summary_line(metrics) + if postgres_line: + candidates.append(("postgres", postgres_line, "high")) + + hardware_line = _hardware_mix_line(inventory) + if hardware_line: + candidates.append(("hardware", hardware_line, "medium")) + + pods_line = _pods_summary_line(metrics) + if pods_line: + candidates.append(("pods", pods_line, "high")) + + return candidates + + +def _select_insight( + prompt: str, + candidates: list[tuple[str, str, str]], +) -> tuple[str, str] | None: + if not candidates: + return None + q = normalize_query(prompt) + prefer_keys: list[str] = [] + if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): + prefer_keys.extend(["hardware", "availability"]) + if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: + return candidates[1][1], candidates[1][2] + if prefer_keys: + for key, text, conf in candidates: + if key in prefer_keys: + return text, conf + key, text, conf = candidates[0] + return text, conf def cluster_overview_answer( @@ -1517,31 +1643,21 @@ def cluster_overview_answer( return "" q = normalize_query(prompt) metrics = _snapshot_metrics(snapshot) - lines: list[str] = [] + sentences: list[str] = [] nodes_line = _nodes_summary_line(inventory, snapshot) if nodes_line: - lines.append(nodes_line) + sentences.append(nodes_line) - if any(word in q for word in ("hardware", "architecture", "nodes", "node", "cluster", "atlas", "titan", "lab")): - hw_line = _hardware_mix_line(inventory) - if hw_line: - lines.append(hw_line) - os_line = _os_mix_line(snapshot) - if os_line: - lines.append(os_line) - - if any( + wants_overview = _is_overview_query(q) or any(word in q for word in ("atlas", "cluster", "titan", "lab")) + wants_hardware = any(word in q for word in ("hardware", "architecture", "nodes", "node")) or wants_overview + wants_metrics = any( word in q for word in ( - "interesting", "status", "health", "overview", "summary", - "tell me", - "what do you know", - "about", "pods", "postgres", "connections", @@ -1558,20 +1674,32 @@ def cluster_overview_answer( "usage", "utilization", ) - ): + ) or wants_overview + + if wants_hardware: + hw_line = _hardware_mix_line(inventory) + if hw_line: + sentences.append(hw_line) + os_line = _os_mix_line(snapshot) + if os_line: + sentences.append(os_line) + + if wants_metrics: pods_line = _pods_summary_line(metrics) if pods_line: - lines.append(pods_line) - hottest_line = _hottest_summary_line(metrics) - if hottest_line: - lines.append(hottest_line) + sentences.append(pods_line) postgres_line = _postgres_summary_line(metrics) if postgres_line: - lines.append(postgres_line) + sentences.append(postgres_line) + hottest_line = _hottest_summary_line(metrics) + if hottest_line: + sentences.append(hottest_line) - if not lines: + if not sentences: return "" - return "Based on the snapshot, " + "\n".join(lines) + if len(sentences) > 3 and not wants_overview: + sentences = sentences[:3] + return "Based on the latest snapshot, " + " ".join(sentences) def cluster_answer( @@ -1582,6 +1710,12 @@ def cluster_answer( workloads: list[dict[str, Any]] | None, ) -> str: metrics_summary = snapshot_context(prompt, snapshot) + if _is_insight_query(prompt): + candidates = _insight_candidates(inventory, snapshot) + selected = _select_insight(prompt, candidates) + if selected: + text, confidence = selected + return _format_confidence(text, confidence) structured = structured_answer( prompt, inventory=inventory, @@ -1602,7 +1736,7 @@ def cluster_answer( overview = 
cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot) if overview: - kb_titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else "" + kb_titles = kb_retrieve_titles(prompt, limit=4) if _doc_intent(prompt) else "" if kb_titles: overview = overview + "\n" + kb_titles return _format_confidence(overview, "medium") From b04092b63c8c59793cdcb5ec609e1be35830bb77 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:10:30 -0300 Subject: [PATCH 335/416] comms: roll atlasbot after bot updates --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 301a474..817e936 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-51 + checksum/atlasbot-configmap: manual-atlasbot-52 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From ad01659cc4a012c76d05936fe37086676c69662b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 21:11:24 +0000 Subject: [PATCH 336/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index bb9e5f0..68eea2c 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 6fead623fa9fd3d7c909f6fd2e2bc11a3aafd7df Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 21:11:27 +0000 Subject: [PATCH 337/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 68eea2c..a813241 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 0a10a2d861a1114642785b45f8ac2fcc9b58b932 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:17:29 -0300 Subject: [PATCH 338/416] atlasbot: add narrative insights --- services/comms/scripts/atlasbot/bot.py | 50 ++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py 
b/services/comms/scripts/atlasbot/bot.py index 0dcfc60..ada8dd7 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1616,7 +1616,7 @@ def _insight_candidates( def _select_insight( prompt: str, candidates: list[tuple[str, str, str]], -) -> tuple[str, str] | None: +) -> tuple[str, str, str] | None: if not candidates: return None q = normalize_query(prompt) @@ -1624,13 +1624,43 @@ def _select_insight( if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): prefer_keys.extend(["hardware", "availability"]) if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: - return candidates[1][1], candidates[1][2] + return candidates[1] if prefer_keys: for key, text, conf in candidates: if key in prefer_keys: - return text, conf - key, text, conf = candidates[0] - return text, conf + return key, text, conf + return candidates[0] + + +def _format_insight_text(key: str, text: str) -> str: + cleaned = text.strip().rstrip(".") + if not cleaned: + return "" + if key == "hardware": + counts = cleaned.replace("Hardware mix includes ", "") + return f"Atlas mixes Raspberry Pi, Jetson, and AMD64 nodes ({counts})." + if key == "postgres": + detail = cleaned.replace("Postgres is at ", "") + return f"Postgres looks healthy at {detail}." + if key == "pods": + detail = cleaned.replace("There are ", "") + return f"Pods look stable with {detail}." + if key == "availability": + return cleaned + "." + if key in ("cpu", "ram"): + return cleaned + "." + return cleaned + "." + + +def _insight_prefix(prompt: str) -> str: + q = normalize_query(prompt) + if any(word in q for word in ("another", "else", "different", "other")): + return "Another interesting detail: " + if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): + return "What stands out is that " + if any(word in q for word in ("interesting", "notable", "fun", "cool")): + return "One notable detail: " + return "" def cluster_overview_answer( @@ -1714,8 +1744,14 @@ def cluster_answer( candidates = _insight_candidates(inventory, snapshot) selected = _select_insight(prompt, candidates) if selected: - text, confidence = selected - return _format_confidence(text, confidence) + key, raw_text, confidence = selected + formatted = _format_insight_text(key, raw_text) + if not formatted: + formatted = raw_text + prefix = _insight_prefix(prompt) + if prefix: + formatted = prefix + formatted + return _format_confidence(formatted, confidence) structured = structured_answer( prompt, inventory=inventory, From e05a949b9ff32e7fc5a5555aa2b58306d6c56c50 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:18:06 -0300 Subject: [PATCH 339/416] comms: roll atlasbot for insight updates --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 817e936..31e3733 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-52 + checksum/atlasbot-configmap: manual-atlasbot-53 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 113bcdededcc6687eb88481fcb337f09e8b9bb02 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 
2026 18:32:27 -0300 Subject: [PATCH 340/416] atlasbot: use history for subjective follow-ups --- services/comms/scripts/atlasbot/bot.py | 94 ++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 5 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index ada8dd7..a446a10 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -191,6 +191,10 @@ _INSIGHT_HINT_WORDS = { "cool", "unique", "notable", + "coolest", + "favorite", + "favourite", + "trivia", } _OVERVIEW_HINT_WORDS = { @@ -1550,6 +1554,21 @@ def _is_insight_query(query: str) -> bool: return False +def _is_subjective_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any(word in q for word in _INSIGHT_HINT_WORDS) or any( + phrase in q + for phrase in ( + "what do you think", + "your favorite", + "your favourite", + "your opinion", + ) + ) + + def _is_overview_query(query: str) -> bool: q = normalize_query(query) if not q: @@ -1602,9 +1621,9 @@ def _insight_candidates( if postgres_line: candidates.append(("postgres", postgres_line, "high")) - hardware_line = _hardware_mix_line(inventory) - if hardware_line: - candidates.append(("hardware", hardware_line, "medium")) + hardware_insight = _hardware_insight(inventory) + if hardware_insight: + candidates.append(("hardware", hardware_insight, "medium")) pods_line = _pods_summary_line(metrics) if pods_line: @@ -1613,6 +1632,29 @@ def _insight_candidates( return candidates +def _hardware_insight(inventory: list[dict[str, Any]]) -> str: + if not inventory: + return "" + groups = _group_nodes(inventory) + jetsons = groups.get("jetson") or [] + rpi5 = groups.get("rpi5") or [] + rpi4 = groups.get("rpi4") or [] + amd64 = groups.get("amd64") or [] + if jetsons: + jetson_names = ", ".join(jetsons[:2]) + return ( + f"Atlas mixes tiny Raspberry Pi nodes with Jetson accelerators ({jetson_names}) " + f"and AMD64 servers, which is unusual for a homelab cluster." + ) + if amd64 and (rpi5 or rpi4): + return ( + "Atlas mixes small ARM boards with a couple of AMD64 machines, " + "so workloads can land on either low-power or high-power nodes." 
+ ) + line = _hardware_mix_line(inventory) + return line.replace("Hardware mix includes ", "Atlas mixes ") if line else "" + + def _select_insight( prompt: str, candidates: list[tuple[str, str, str]], @@ -1623,6 +1665,8 @@ def _select_insight( prefer_keys: list[str] = [] if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): prefer_keys.extend(["hardware", "availability"]) + if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")): + prefer_keys.extend(["hardware", "cpu", "ram"]) if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: return candidates[1] if prefer_keys: @@ -2284,7 +2328,24 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) - cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + history_payload = payload.get("history") or [] + history_lines: list[str] = [] + if isinstance(history_payload, list): + for item in history_payload[-10:]: + if isinstance(item, dict): + content = item.get("content") or item.get("message") or "" + if isinstance(content, str) and content.strip(): + history_lines.append(content.strip()) + elif isinstance(item, str) and item.strip(): + history_lines.append(item.strip()) + history_cluster = _history_mentions_cluster( + history_lines, + inventory=inventory, + workloads=workloads, + ) + cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) or ( + _is_subjective_query(cleaned) and history_cluster + ) context = "" if cluster_query: context = build_context( @@ -2329,6 +2390,22 @@ history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] ( def key_for(room_id: str, sender: str, is_dm: bool): return (room_id, None) if is_dm else (room_id, sender) + +def _history_mentions_cluster( + history_lines: list[str], + *, + inventory: list[dict[str, Any]] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> bool: + recent = [line for line in history_lines[-8:] if isinstance(line, str)] + for line in recent: + cleaned = normalize_query(line) + if not cleaned: + continue + if _is_cluster_query(cleaned, inventory=inventory, workloads=workloads): + return True + return False + def build_context( prompt: str, *, @@ -2734,7 +2811,14 @@ def sync_loop(token: str, room_id: str): if not inventory: inventory = _snapshot_inventory(snapshot) workloads = _snapshot_workloads(snapshot) - cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + history_cluster = _history_mentions_cluster( + history[hist_key], + inventory=inventory, + workloads=workloads, + ) + cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) or ( + _is_subjective_query(cleaned_body) and history_cluster + ) context = "" if cluster_query: context = build_context( From 58dab1ca797578e02ac05b2513ff94d5ac3fae6a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:32:54 -0300 Subject: [PATCH 341/416] comms: roll atlasbot after history update --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 31e3733..03e9dc2 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot 
annotations: - checksum/atlasbot-configmap: manual-atlasbot-53 + checksum/atlasbot-configmap: manual-atlasbot-54 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 4e6d4f43b2551f7175179f4cfc6da6aa40fb1c89 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:43:03 -0300 Subject: [PATCH 342/416] atlasbot: improve insight voice and avoid repeats --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 86 ++++++++++++++++++++----- 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 03e9dc2..dc1b0bb 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-54 + checksum/atlasbot-configmap: manual-atlasbot-55 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index a446a10..2616cb1 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1640,27 +1640,49 @@ def _hardware_insight(inventory: list[dict[str, Any]]) -> str: rpi5 = groups.get("rpi5") or [] rpi4 = groups.get("rpi4") or [] amd64 = groups.get("amd64") or [] + parts: list[str] = [] + if rpi5: + parts.append(f"rpi5={len(rpi5)}") + if rpi4: + parts.append(f"rpi4={len(rpi4)}") if jetsons: jetson_names = ", ".join(jetsons[:2]) - return ( - f"Atlas mixes tiny Raspberry Pi nodes with Jetson accelerators ({jetson_names}) " - f"and AMD64 servers, which is unusual for a homelab cluster." - ) - if amd64 and (rpi5 or rpi4): - return ( - "Atlas mixes small ARM boards with a couple of AMD64 machines, " - "so workloads can land on either low-power or high-power nodes." 
- ) - line = _hardware_mix_line(inventory) - return line.replace("Hardware mix includes ", "Atlas mixes ") if line else "" + parts.append(f"jetson={len(jetsons)} ({jetson_names})") + if amd64: + parts.append(f"amd64={len(amd64)}") + return ", ".join(parts) + + +def _recent_insight_keys(history_lines: list[str]) -> set[str]: + used: set[str] = set() + for line in history_lines[-10:]: + lower = normalize_query(line) + if not lower: + continue + if "postgres" in lower or "connections" in lower: + used.add("postgres") + if "atlas mixes" in lower or "hardware" in lower or "rpi" in lower or "jetson" in lower: + used.add("hardware") + if "busiest cpu" in lower or "cpu right now" in lower or "cpu " in lower: + used.add("cpu") + if "ram usage" in lower or "memory" in lower: + used.add("ram") + if "pods" in lower: + used.add("pods") + if "not ready" in lower: + used.add("availability") + return used def _select_insight( prompt: str, candidates: list[tuple[str, str, str]], + *, + used_keys: set[str] | None = None, ) -> tuple[str, str, str] | None: if not candidates: return None + used = used_keys or set() q = normalize_query(prompt) prefer_keys: list[str] = [] if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): @@ -1668,11 +1690,21 @@ def _select_insight( if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")): prefer_keys.extend(["hardware", "cpu", "ram"]) if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: + for candidate in candidates: + if candidate[0] not in used: + return candidate return candidates[1] if prefer_keys: + for key, text, conf in candidates: + if key in prefer_keys and key not in used: + return key, text, conf for key, text, conf in candidates: if key in prefer_keys: return key, text, conf + if used: + for candidate in candidates: + if candidate[0] not in used: + return candidate return candidates[0] @@ -1681,29 +1713,45 @@ def _format_insight_text(key: str, text: str) -> str: if not cleaned: return "" if key == "hardware": - counts = cleaned.replace("Hardware mix includes ", "") - return f"Atlas mixes Raspberry Pi, Jetson, and AMD64 nodes ({counts})." + counts = ( + cleaned.replace("Hardware mix includes ", "") + .replace("Atlas mixes tiny ", "") + .replace("Atlas mixes ", "") + .replace("which is unusual for a homelab cluster", "") + .strip() + .strip(".") + ) + return f"the mixed hardware stack ({counts}) is a bit unconventional for a homelab." if key == "postgres": detail = cleaned.replace("Postgres is at ", "") - return f"Postgres looks healthy at {detail}." + return f"Postgres looks healthy at {detail}; that suggests moderate load." if key == "pods": detail = cleaned.replace("There are ", "") return f"Pods look stable with {detail}." if key == "availability": return cleaned + "." if key in ("cpu", "ram"): - return cleaned + "." + suffix = " That likely marks the busiest workload right now." if key == "cpu" else " That box is carrying the heaviest memory load." + return cleaned + "." + suffix return cleaned + "." 
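[Note on the selection flow this patch wires together: _recent_insight_keys plus the used_keys parameter reduce to a preference scan over (key, text, confidence) tuples against a set of recently mentioned topic keys. A minimal, self-contained sketch of that pattern — the function name and sample data below are illustrative, not the production helpers:

def pick_insight(
    candidates: list[tuple[str, str, str]],
    used: set[str],
) -> tuple[str, str, str] | None:
    """Prefer the first candidate whose topic key was not mentioned recently."""
    for candidate in candidates:
        if candidate[0] not in used:
            return candidate
    # Every topic came up recently; fall back to the top candidate.
    return candidates[0] if candidates else None

# "hardware" appeared in the last few messages, so the postgres insight wins.
cands = [
    ("hardware", "mixed hardware stack", "medium"),
    ("postgres", "Postgres is at 42/200 connections", "high"),
]
assert pick_insight(cands, {"hardware"}) == cands[1]

The real _select_insight layers prompt-driven prefer_keys on top of this scan, but the de-dup core is the same.]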
def _insight_prefix(prompt: str) -> str: q = normalize_query(prompt) + if "coolest" in q: + return "If I had to pick the coolest detail, it's " + if "favorite" in q or "favourite" in q: + return "My favorite detail is " + if "trivia" in q: + return "A bit of trivia I like: " + if "most interesting" in q: + return "The most interesting detail to me is " if any(word in q for word in ("another", "else", "different", "other")): return "Another interesting detail: " if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): return "What stands out is that " if any(word in q for word in ("interesting", "notable", "fun", "cool")): - return "One notable detail: " + return "One thing I'd highlight is " return "" @@ -1782,11 +1830,13 @@ def cluster_answer( inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None, workloads: list[dict[str, Any]] | None, + history_lines: list[str] | None = None, ) -> str: metrics_summary = snapshot_context(prompt, snapshot) if _is_insight_query(prompt): candidates = _insight_candidates(inventory, snapshot) - selected = _select_insight(prompt, candidates) + used_keys = _recent_insight_keys(history_lines or []) + selected = _select_insight(prompt, candidates, used_keys=used_keys) if selected: key, raw_text, confidence = selected formatted = _format_insight_text(key, raw_text) @@ -2363,6 +2413,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory=inventory, snapshot=snapshot, workloads=workloads, + history_lines=history_lines, ) if not answer: answer = fallback @@ -2843,6 +2894,7 @@ def sync_loop(token: str, room_id: str): inventory=inventory, snapshot=snapshot, workloads=workloads, + history_lines=history[hist_key], ) if not reply: reply = fallback From 51bf01a8fd16f41c714aa5725fa3105167742df8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:44:58 -0300 Subject: [PATCH 343/416] monitoring: keep idle label in gpu share --- scripts/dashboards_render_atlas.py | 2 +- services/monitoring/dashboards/atlas-gpu.json | 2 +- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/grafana-dashboard-gpu.yaml | 2 +- services/monitoring/grafana-dashboard-overview.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 34ded89..445de94 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -266,7 +266,7 @@ def namespace_gpu_share_expr(scope_var): usage = namespace_gpu_usage_instant(scope_var) total = f"(sum({usage}) or on() vector(0))" share = f"100 * ({usage}) / clamp_min({total}, 1)" - idle = 'label_replace(vector(100), "namespace", "idle", "", "") * on() (' + total + " == bool 0)" + idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)" return f"({share}) or ({idle})" diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index f6801aa..132f276 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * 
on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) 
kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 1a507ec..b212c8c 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() 
((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": 
"A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index dc1025b..55f63e8 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ed63da0..a899002 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or 
label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) 
by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } From 4e51cf6b6caafccdd8a363ef66d600cc64d39f58 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:45:49 -0300 Subject: [PATCH 344/416] atlasbot: tighten insight phrasing --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index dc1b0bb..4a3949d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-55 + checksum/atlasbot-configmap: manual-atlasbot-56 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 2616cb1..9beff7f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1721,17 +1721,21 @@ def _format_insight_text(key: str, text: str) -> str: .strip() .strip(".") ) - return f"the mixed hardware stack ({counts}) is a bit unconventional for a homelab." + return f"mixed hardware stack ({counts}), which is unusual for a homelab." if key == "postgres": detail = cleaned.replace("Postgres is at ", "") - return f"Postgres looks healthy at {detail}; that suggests moderate load." + return f"Postgres is at {detail}; that suggests moderate load." if key == "pods": detail = cleaned.replace("There are ", "") return f"Pods look stable with {detail}." if key == "availability": return cleaned + "." if key in ("cpu", "ram"): - suffix = " That likely marks the busiest workload right now." if key == "cpu" else " That box is carrying the heaviest memory load." + suffix = ( + " That likely marks the busiest workload right now." + if key == "cpu" + else " That box is carrying the heaviest memory load." + ) return cleaned + "." + suffix return cleaned + "." 
@@ -1739,19 +1743,19 @@ def _format_insight_text(key: str, text: str) -> str: def _insight_prefix(prompt: str) -> str: q = normalize_query(prompt) if "coolest" in q: - return "If I had to pick the coolest detail, it's " + return "If I had to pick the coolest detail: " if "favorite" in q or "favourite" in q: - return "My favorite detail is " + return "My favorite detail: " if "trivia" in q: return "A bit of trivia I like: " if "most interesting" in q: - return "The most interesting detail to me is " + return "The most interesting detail to me: " if any(word in q for word in ("another", "else", "different", "other")): return "Another interesting detail: " if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): return "What stands out is that " if any(word in q for word in ("interesting", "notable", "fun", "cool")): - return "One thing I'd highlight is " + return "One thing I'd highlight: " return "" From c4ad82f122743d9cc2f5d5d5086f61e7041ddc44 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:48:35 -0300 Subject: [PATCH 345/416] atlasbot: add more opinionated hardware insight --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4a3949d..d02255e 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-56 + checksum/atlasbot-configmap: manual-atlasbot-57 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9beff7f..54434e7 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1721,7 +1721,9 @@ def _format_insight_text(key: str, text: str) -> str: .strip() .strip(".") ) - return f"mixed hardware stack ({counts}), which is unusual for a homelab." + detail = f"mixed hardware stack ({counts})" + flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 nodes." + return f"{detail}. {flavor}" if key == "postgres": detail = cleaned.replace("Postgres is at ", "") return f"Postgres is at {detail}; that suggests moderate load." @@ -1732,9 +1734,9 @@ def _format_insight_text(key: str, text: str) -> str: return cleaned + "." if key in ("cpu", "ram"): suffix = ( - " That likely marks the busiest workload right now." + " If you're chasing hotspots, that's the busiest workload right now." if key == "cpu" - else " That box is carrying the heaviest memory load." + else " That box is carrying the heaviest memory load right now." ) return cleaned + "." + suffix return cleaned + "." 
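[Note: the subjective-prompt routing built up since PATCH 340 reduces to one gate — treat a vague prompt ("what's the coolest part?") as a cluster question when recent room history was about the cluster. A condensed, self-contained sketch of that gate; both classifier helpers here are keyword stand-ins for _is_cluster_query and _is_subjective_query, not the production logic:

def _mentions_cluster(text: str) -> bool:
    # Stand-in for _is_cluster_query(): keyword check only.
    return any(w in text.lower() for w in ("node", "pod", "cluster", "atlas"))

def _is_subjective(text: str) -> bool:
    # Stand-in for _is_subjective_query(): opinion-flavored hints.
    return any(w in text.lower() for w in ("coolest", "favorite", "interesting", "weird"))

def should_route_to_cluster(prompt: str, history: list[str]) -> bool:
    """Route subjective follow-ups using the last few history lines."""
    history_cluster = any(_mentions_cluster(line) for line in history[-8:])
    return _mentions_cluster(prompt) or (_is_subjective(prompt) and history_cluster)

# A follow-up with no cluster words still routes, because history mentioned nodes.
assert should_route_to_cluster("what's the coolest part?", ["how many nodes are ready?"])

The deployment rolls in between these patches exist only to pick up the ConfigMap changes; the behavior lives entirely in bot.py.]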
From 79650616f181043797eb2b34bdb6c0c332165638 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:51:00 -0300 Subject: [PATCH 346/416] atlasbot: make insights sound more human --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d02255e..2c0b84d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-57 + checksum/atlasbot-configmap: manual-atlasbot-58 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 54434e7..659ea49 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1729,9 +1729,9 @@ def _format_insight_text(key: str, text: str) -> str: return f"Postgres is at {detail}; that suggests moderate load." if key == "pods": detail = cleaned.replace("There are ", "") - return f"Pods look stable with {detail}." + return f"Pods look steady ({detail}); the workload mix looks healthy." if key == "availability": - return cleaned + "." + return cleaned + " That suggests the cluster is stable right now." if key in ("cpu", "ram"): suffix = ( " If you're chasing hotspots, that's the busiest workload right now." From 69d121aa0743b3b1319ddc14a2834a91bfdefa25 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:54:05 -0300 Subject: [PATCH 347/416] atlasbot: use hottest node labels for insights --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 2c0b84d..1212505 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-58 + checksum/atlasbot-configmap: manual-atlasbot-59 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 659ea49..7f92d8e 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1608,14 +1608,26 @@ def _insight_candidates( hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} if hottest: + def _hot_node(entry: dict[str, Any]) -> str: + if not isinstance(entry, dict): + return "" + return ( + entry.get("node") + or entry.get("label") + or (entry.get("metric") or {}).get("node") + or "" + ) + cpu = hottest.get("cpu") if isinstance(hottest.get("cpu"), dict) else {} - if cpu.get("node") and cpu.get("value") is not None: + cpu_node = _hot_node(cpu) + if cpu_node and cpu.get("value") is not None: value_fmt = _format_metric_value(str(cpu.get("value")), percent=True) - candidates.append(("cpu", f"The busiest CPU right now is {cpu.get('node')} at about {value_fmt}.", "high")) + candidates.append(("cpu", f"The busiest 
CPU right now is {cpu_node} at about {value_fmt}.", "high")) ram = hottest.get("ram") if isinstance(hottest.get("ram"), dict) else {} - if ram.get("node") and ram.get("value") is not None: + ram_node = _hot_node(ram) + if ram_node and ram.get("value") is not None: value_fmt = _format_metric_value(str(ram.get("value")), percent=True) - candidates.append(("ram", f"RAM usage peaks on {ram.get('node')} at about {value_fmt}.", "high")) + candidates.append(("ram", f"RAM usage peaks on {ram_node} at about {value_fmt}.", "high")) postgres_line = _postgres_summary_line(metrics) if postgres_line: From 8bd4d9fc7af48a84915c83fc9e3a81ffdd2fb207 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:56:14 -0300 Subject: [PATCH 348/416] atlasbot: prioritize hardware for subjective prompts --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 1212505..cbc79e5 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-59 + checksum/atlasbot-configmap: manual-atlasbot-60 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7f92d8e..613b0c6 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1707,12 +1707,14 @@ def _select_insight( return candidate return candidates[1] if prefer_keys: - for key, text, conf in candidates: - if key in prefer_keys and key not in used: - return key, text, conf - for key, text, conf in candidates: - if key in prefer_keys: - return key, text, conf + for prefer in prefer_keys: + for key, text, conf in candidates: + if key == prefer and key not in used: + return key, text, conf + for prefer in prefer_keys: + for key, text, conf in candidates: + if key == prefer: + return key, text, conf if used: for candidate in candidates: if candidate[0] not in used: From 4bab34eae10fb7cba41262cc6e02ddeb3bf66388 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:58:59 -0300 Subject: [PATCH 349/416] atlasbot: keep coolest answers opinionated --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index cbc79e5..ef6b88b 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-60 + checksum/atlasbot-configmap: manual-atlasbot-61 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 613b0c6..9434e91 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1701,6 +1701,7 @@ def _select_insight( prefer_keys.extend(["hardware", "availability"]) if any(word in q for word in ("coolest", 
"favorite", "favourite", "trivia", "fun")): prefer_keys.extend(["hardware", "cpu", "ram"]) + avoid_used = any(word in q for word in ("another", "else", "different", "other")) or "most interesting" in q if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: for candidate in candidates: if candidate[0] not in used: @@ -1709,13 +1710,13 @@ def _select_insight( if prefer_keys: for prefer in prefer_keys: for key, text, conf in candidates: - if key == prefer and key not in used: + if key == prefer and (not avoid_used or key not in used): return key, text, conf for prefer in prefer_keys: for key, text, conf in candidates: if key == prefer: return key, text, conf - if used: + if used and avoid_used: for candidate in candidates: if candidate[0] not in used: return candidate From 243d3112ce353b7db5ae9891d2176dbb0cb08808 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:01:16 -0300 Subject: [PATCH 350/416] atlasbot: prefer hardware for general interest --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index ef6b88b..e8e22a3 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-61 + checksum/atlasbot-configmap: manual-atlasbot-62 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9434e91..f9e6b81 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1701,6 +1701,8 @@ def _select_insight( prefer_keys.extend(["hardware", "availability"]) if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")): prefer_keys.extend(["hardware", "cpu", "ram"]) + if "interesting" in q and "most interesting" not in q: + prefer_keys.extend(["hardware", "postgres", "cpu", "ram"]) avoid_used = any(word in q for word in ("another", "else", "different", "other")) or "most interesting" in q if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: for candidate in candidates: From ea8eda2c73a52d63c12ea59e1a83018b3ca693bd Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:04:29 -0300 Subject: [PATCH 351/416] atlasbot: treat hardware prompts as cluster queries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index e8e22a3..36bb1db 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-62 + checksum/atlasbot-configmap: manual-atlasbot-63 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f9e6b81..4ca3b2e 100644 --- 
a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -139,6 +139,8 @@ CLUSTER_HINT_WORDS = { "kubernetes", "node", "nodes", + "hardware", + "architecture", "worker", "workers", "pod", From 9bf822ec36f4150013da1e8439d91a41f3ba4cc5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:06:44 -0300 Subject: [PATCH 352/416] atlasbot: answer hardware mix queries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 36bb1db..9cc0a1e 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-63 + checksum/atlasbot-configmap: manual-atlasbot-64 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 4ca3b2e..570bc26 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1292,6 +1292,11 @@ def structured_answer( if not op and entity == "node": op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" + if entity == "node" and ("hardware mix" in q or "architecture" in q): + hw_line = _hardware_mix_line(inventory) + if hw_line: + return _format_confidence(hw_line, "high") + if op == "top" and metric is None: metric = "cpu" From 12fa7d02aaad250c1fd29ccb51b177dd3eff1138 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:10:02 -0300 Subject: [PATCH 353/416] atlasbot: expand hardware and entity detection --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 9cc0a1e..72503b8 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-64 + checksum/atlasbot-configmap: manual-atlasbot-65 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 570bc26..2b3657a 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -197,6 +197,8 @@ _INSIGHT_HINT_WORDS = { "favorite", "favourite", "trivia", + "stand out", + "stands out", } _OVERVIEW_HINT_WORDS = { @@ -213,8 +215,8 @@ _OLLAMA_LOCK = threading.Lock() HARDWARE_HINTS = { "amd64": ("amd64", "x86", "x86_64", "x86-64"), "jetson": ("jetson",), - "rpi4": ("rpi4",), - "rpi5": ("rpi5",), + "rpi4": ("rpi4", "raspberry pi 4", "raspberry pi-4"), + "rpi5": ("rpi5", "raspberry pi 5", "raspberry pi-5"), "rpi": ("rpi", "raspberry"), "arm64": ("arm64", "aarch64"), } @@ -559,7 +561,16 @@ def _detect_role_filters(q: str) -> set[str]: return roles def _detect_entity(q: str) -> str | None: - if "node" in q or "nodes" in q or "worker" in q or "hardware" in q or "architecture" in q or TITAN_NODE_RE.search(q): + if ( + 
"node" in q + or "nodes" in q + or "worker" in q + or "hardware" in q + or "architecture" in q + or "machine" in q + or "machines" in q + or TITAN_NODE_RE.search(q) + ): return "node" if "pod" in q or "pods" in q: return "pod" From 88426622395c25dfb033d1d01ef40c80f561e795 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:13:31 -0300 Subject: [PATCH 354/416] atlasbot: refine node and postgres query handling --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 72503b8..e1ff2bb 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-65 + checksum/atlasbot-configmap: manual-atlasbot-66 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 2b3657a..abdcbf2 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -538,7 +538,17 @@ def _detect_metric(q: str) -> str | None: def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: include: set[str] = set() exclude: set[str] = set() - rpi_specific = "rpi4" in q or "rpi5" in q + rpi_specific = any( + phrase in q + for phrase in ( + "rpi4", + "rpi5", + "raspberry pi 4", + "raspberry pi 5", + "raspberry pi-4", + "raspberry pi-5", + ) + ) for hardware, phrases in HARDWARE_HINTS.items(): if hardware == "rpi" and rpi_specific: continue @@ -1226,7 +1236,11 @@ def snapshot_metric_answer( hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} parts: list[str] = [] if used is not None and max_conn is not None: - parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") + free = max_conn - used + if any(word in q for word in ("free", "available", "remaining")): + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max ({free:.0f} free).") + else: + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") if hottest.get("label"): hot_val = hottest.get("value") hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" @@ -1303,6 +1317,11 @@ def structured_answer( if not op and entity == "node": op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" + if entity == "node" and "total" in q and "ready" in q: + summary = _nodes_summary_line(inventory, snapshot) + if summary: + return _format_confidence(summary, "high") + if entity == "node" and ("hardware mix" in q or "architecture" in q): hw_line = _hardware_mix_line(inventory) if hw_line: From 20364a262cdc2c0209575dca028dc93c46865e87 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:37:20 -0300 Subject: [PATCH 355/416] atlasbot: strengthen subjective insights --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 58 +++++++++++++++++-------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index e1ff2bb..4ac3582 100644 --- a/services/comms/atlasbot-deployment.yaml +++ 
b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-66 + checksum/atlasbot-configmap: manual-atlasbot-67 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index abdcbf2..0d0f92b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -579,6 +579,10 @@ def _detect_entity(q: str) -> str | None: or "architecture" in q or "machine" in q or "machines" in q + or "host" in q + or "hosts" in q + or "hostname" in q + or "hostnames" in q or TITAN_NODE_RE.search(q) ): return "node" @@ -1775,20 +1779,29 @@ def _format_insight_text(key: str, text: str) -> str: .strip() .strip(".") ) + has_jetson = "jetson=" in counts + has_amd64 = "amd64=" in counts detail = f"mixed hardware stack ({counts})" - flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 nodes." + if has_jetson and has_amd64: + flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 boxes." + elif has_jetson: + flavor = "It pairs low-power Pis with Jetson accelerators for edge and AI workloads." + elif has_amd64: + flavor = "It mixes low-power Pis with a couple of heavier AMD64 nodes." + else: + flavor = "It is a pretty uniform hardware stack, which is rare for a homelab." return f"{detail}. {flavor}" if key == "postgres": detail = cleaned.replace("Postgres is at ", "") - return f"Postgres is at {detail}; that suggests moderate load." + return f"Postgres is at {detail}; that feels like healthy, steady load rather than strain." if key == "pods": detail = cleaned.replace("There are ", "") - return f"Pods look steady ({detail}); the workload mix looks healthy." + return f"Pods look steady ({detail}); nothing looks stuck or unhealthy." if key == "availability": - return cleaned + " That suggests the cluster is stable right now." + return cleaned + " That is the kind of stability I like to see." if key in ("cpu", "ram"): suffix = ( - " If you're chasing hotspots, that's the busiest workload right now." + " If you're chasing hotspots, that's the node I'd watch first." if key == "cpu" else " That box is carrying the heaviest memory load right now." 
) @@ -1799,19 +1812,19 @@ def _format_insight_text(key: str, text: str) -> str: def _insight_prefix(prompt: str) -> str: q = normalize_query(prompt) if "coolest" in q: - return "If I had to pick the coolest detail: " + return "If I had to pick the coolest detail, I'd say " if "favorite" in q or "favourite" in q: - return "My favorite detail: " + return "My favorite detail is " if "trivia" in q: return "A bit of trivia I like: " if "most interesting" in q: - return "The most interesting detail to me: " + return "The most interesting detail to me is " if any(word in q for word in ("another", "else", "different", "other")): return "Another interesting detail: " if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): - return "What stands out is that " + return "What stands out to me is that " if any(word in q for word in ("interesting", "notable", "fun", "cool")): - return "One thing I'd highlight: " + return "One thing I'd call out is " return "" @@ -2389,6 +2402,21 @@ def _normalize_reply(value: Any) -> str: return _ensure_confidence(text) +def _history_payload_lines(history_payload: list[Any]) -> list[str]: + lines: list[str] = [] + if not isinstance(history_payload, list): + return lines + for item in history_payload[-12:]: + if isinstance(item, dict): + for key in ("content", "message", "text", "prompt", "question", "body", "answer", "reply", "response"): + val = item.get(key) + if isinstance(val, str) and val.strip(): + lines.append(val.strip()) + elif isinstance(item, str) and item.strip(): + lines.append(item.strip()) + return [line for line in lines if line] + + # Internal HTTP endpoint for cluster answers (website uses this). class _AtlasbotHandler(BaseHTTPRequestHandler): server_version = "AtlasbotHTTP/1.0" @@ -2439,15 +2467,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) history_payload = payload.get("history") or [] - history_lines: list[str] = [] - if isinstance(history_payload, list): - for item in history_payload[-10:]: - if isinstance(item, dict): - content = item.get("content") or item.get("message") or "" - if isinstance(content, str) and content.strip(): - history_lines.append(content.strip()) - elif isinstance(item, str) and item.strip(): - history_lines.append(item.strip()) + history_lines = _history_payload_lines(history_payload) history_cluster = _history_mentions_cluster( history_lines, inventory=inventory, From 18e543d95a09956967a92dd1f6094b3ce90171be Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:42:04 -0300 Subject: [PATCH 356/416] atlasbot: refine insight tone and status --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4ac3582..609c245 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-67 + checksum/atlasbot-configmap: manual-atlasbot-68 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 0d0f92b..db0f560 100644 --- 
a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -118,7 +118,7 @@ CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECAS OPERATION_HINTS = { "count": ("how many", "count", "number", "total"), "list": ("list", "which", "what are", "show", "names"), - "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum"), + "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum", "busiest", "busy"), "status": ("ready", "not ready", "unready", "down", "missing", "status"), } @@ -1414,6 +1414,11 @@ def structured_answer( names = [node["name"] for node in filtered] if op == "status": + if "missing" in q and ("ready" in q or "readiness" in q): + return _format_confidence( + "Not ready nodes: " + (", ".join(names) if names else "none") + ".", + "high", + ) if "missing" in q and expected_workers: missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) return _format_confidence( From 9e06d7afc8adb2ee0c51f2b3df454fee8b5add42 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 20:02:09 -0300 Subject: [PATCH 357/416] atlasbot: route subjective queries to LLM --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 60 +++++++++++++++++-------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 609c245..d8ce3ee 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-68 + checksum/atlasbot-configmap: manual-atlasbot-69 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index db0f560..141b971 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1911,19 +1911,6 @@ def cluster_answer( history_lines: list[str] | None = None, ) -> str: metrics_summary = snapshot_context(prompt, snapshot) - if _is_insight_query(prompt): - candidates = _insight_candidates(inventory, snapshot) - used_keys = _recent_insight_keys(history_lines or []) - selected = _select_insight(prompt, candidates, used_keys=used_keys) - if selected: - key, raw_text, confidence = selected - formatted = _format_insight_text(key, raw_text) - if not formatted: - formatted = raw_text - prefix = _insight_prefix(prompt) - if prefix: - formatted = prefix + formatted - return _format_confidence(formatted, confidence) structured = structured_answer( prompt, inventory=inventory, @@ -2422,6 +2409,17 @@ def _history_payload_lines(history_payload: list[Any]) -> list[str]: return [line for line in lines if line] +def _append_history_context(context: str, history_lines: list[str]) -> str: + lines = [line.strip() for line in history_lines if isinstance(line, str) and line.strip()] + if not lines: + return context + snippet = "\n".join(lines[-6:]) + combined = context + "\nRecent chat:\n" + snippet if context else "Recent chat:\n" + snippet + if len(combined) > MAX_CONTEXT_CHARS: + combined = combined[: MAX_CONTEXT_CHARS - 3].rstrip() + "..." + return combined + + # Internal HTTP endpoint for cluster answers (website uses this). 
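# Context-merge sketch (hedged): MAX_CONTEXT_CHARS is the existing
# module-level cap, not shown in this hunk. Only the last six history
# lines are kept, and the merged string is truncated with a trailing
# "..." when it overflows:
#
#   _append_history_context(
#       "Facts:\nF1: 28 nodes ready.",
#       ["user: how many nodes?", "Atlas: 28 ready."],
#   )
#   -> "Facts:\nF1: 28 nodes ready.\nRecent chat:\nuser: how many nodes?\nAtlas: 28 ready."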
class _AtlasbotHandler(BaseHTTPRequestHandler): server_version = "AtlasbotHTTP/1.0" @@ -2493,15 +2491,25 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): ) fallback = "I don't have enough data to answer that." if cluster_query: - answer = cluster_answer( + facts_answer = cluster_answer( cleaned, inventory=inventory, snapshot=snapshot, workloads=workloads, history_lines=history_lines, ) - if not answer: - answer = fallback + open_ended = _is_subjective_query(cleaned) or _knowledge_intent(cleaned) + if open_ended: + llm_context = _append_history_context(context, history_lines) + answer = ollama_reply( + ("http", "internal"), + cleaned, + context=llm_context, + fallback=facts_answer or fallback, + use_history=False, + ) + else: + answer = facts_answer or fallback else: llm_prompt = cleaned answer = ollama_reply( @@ -2761,11 +2769,13 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru "When a cluster snapshot is provided, never answer about unrelated meanings of 'Atlas' (maps, mythology, Apache Atlas, etc). " "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " + "For subjective prompts (interesting, favorite, unconventional), pick one or two observations from the context, explain why they stand out in 1-2 sentences, and avoid repeating the same observation as the last response if you can. " "Prefer exact repo paths and Kubernetes resource names when relevant. " "Never include or request secret values. " "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " "Translate metrics into natural language instead of echoing raw label/value pairs. " + "Avoid bare lists unless the user asked for a list; weave numbers into sentences. " "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. " "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. " "If the answer is not grounded in the provided context or tool data, say you do not know. " @@ -2974,15 +2984,27 @@ def sync_loop(token: str, room_id: str): fallback = "I don't have enough data to answer that." 
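# Routing sketch (predicate names as in this patch; their internals live
# earlier in bot.py): subjective or knowledge-style prompts now reach the
# LLM with the facts context attached, while everything else stays on the
# deterministic path:
#   "what's the most interesting thing about the cluster?" -> open_ended -> ollama_reply(...)
#   "how many nodes are ready?"                            -> structured -> cluster_answer(...)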
if cluster_query: - reply = cluster_answer( + facts_answer = cluster_answer( cleaned_body, inventory=inventory, snapshot=snapshot, workloads=workloads, history_lines=history[hist_key], ) - if not reply: - reply = fallback + open_ended = _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body) + if open_ended: + llm_context = _append_history_context(context, history[hist_key]) + reply = ollama_reply_with_thinking( + token, + rid, + hist_key, + cleaned_body, + context=llm_context, + fallback=facts_answer or fallback, + use_history=False, + ) + else: + reply = facts_answer or fallback else: llm_prompt = cleaned_body reply = ollama_reply_with_thinking( From 34c91c6d088687e849f72ed81be984247392f4a9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:02:20 -0300 Subject: [PATCH 358/416] atlasbot: refine open-ended reasoning pipeline --- services/comms/atlasbot-deployment.yaml | 6 +- services/comms/scripts/atlasbot/bot.py | 446 +++++++++++++++++++++--- 2 files changed, 401 insertions(+), 51 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d8ce3ee..cc628dd 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-69 + checksum/atlasbot-configmap: manual-atlasbot-70 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -78,11 +78,11 @@ spec: - name: BOT_USER value: atlasbot - name: BOT_MENTIONS - value: atlasbot,aatlasbot + value: atlasbot,aatlasbot,atlas_quick,atlas_smart - name: OLLAMA_URL value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL - value: qwen2.5:14b-instruct-q4_0 + value: qwen2.5:14b-instruct - name: OLLAMA_TIMEOUT_SEC value: "600" - name: ATLASBOT_THINKING_INTERVAL_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 141b971..aa7e614 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -333,6 +333,19 @@ def _strip_bot_mention(text: str) -> str: return cleaned or text.strip() +def _detect_mode_from_body(body: str, *, default: str = "deep") -> str: + lower = normalize_query(body or "") + if "atlas_quick" in lower or "atlas-quick" in lower: + return "fast" + if "atlas_smart" in lower or "atlas-smart" in lower: + return "deep" + if lower.startswith("quick ") or lower.startswith("fast "): + return "fast" + if lower.startswith("smart ") or lower.startswith("deep "): + return "deep" + return default + + # Matrix HTTP helper. 
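# Mode-marker sketch (behavior read off _detect_mode_from_body above; the
# hyphen forms "atlas-quick"/"atlas-smart" match too, and both names are
# added to BOT_MENTIONS in this patch so they double as mentions):
#   _detect_mode_from_body("atlas_quick how many nodes?")      -> "fast"
#   _detect_mode_from_body("atlas_smart what stands out?")     -> "deep"
#   _detect_mode_from_body("quick list the workers")           -> "fast"  (prefix match)
#   _detect_mode_from_body("how many pods?", default="deep")   -> "deep"  (no marker, default wins)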
def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): url = (base or BASE) + path @@ -2420,6 +2433,300 @@ def _append_history_context(context: str, history_lines: list[str]) -> str: return combined +class ThoughtState: + def __init__(self, total_steps: int = 0): + self._lock = threading.Lock() + self.stage = "starting" + self.note = "" + self.step = 0 + self.total_steps = total_steps + + def update(self, stage: str, *, note: str = "", step: int | None = None) -> None: + with self._lock: + self.stage = stage + if note: + self.note = note + if step is not None: + self.step = step + + def status_line(self) -> str: + with self._lock: + stage = self.stage + note = self.note + step = self.step + total = self.total_steps + step_part = f"{step}/{total}" if total else str(step) if step else "" + detail = f"Stage {step_part}: {stage}".strip() + if note: + return f"Still thinking ({detail}). Latest insight: {note}" + return f"Still thinking ({detail})." + + +def _ollama_json_call(prompt: str, *, context: str, retries: int = 2) -> dict[str, Any]: + system = ( + "System: You are Atlas, a reasoning assistant. " + "Return strict JSON only (no code fences, no trailing commentary). " + "If you cannot comply, return {}. " + "Only use facts from the provided context. " + "If you make an inference, label it as 'inference' in the JSON." + ) + last_exc: Exception | None = None + for attempt in range(max(1, retries + 1)): + try: + raw = _ollama_call( + ("json", "internal"), + prompt, + context=context, + use_history=False, + system_override=system, + ) + cleaned = _strip_code_fence(raw).strip() + if cleaned.startswith("{") and cleaned.endswith("}"): + return json.loads(cleaned) + last = json.loads(_strip_code_fence(cleaned)) + if isinstance(last, dict): + return last + except Exception as exc: # noqa: BLE001 + last_exc = exc + time.sleep(min(2, 2 ** attempt)) + if last_exc: + return {} + return {} + + +def _fact_pack_lines( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> list[str]: + raw = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + lines: list[str] = [] + for line in raw.splitlines(): + trimmed = line.strip() + if not trimmed or trimmed.lower().startswith("facts"): + continue + lines.append(trimmed) + return lines + + +def _fact_pack_text(lines: list[str]) -> str: + labeled = [f"F{idx + 1}: {line}" for idx, line in enumerate(lines)] + return "Fact pack:\n" + "\n".join(labeled) + + +def _open_ended_system() -> str: + return ( + "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " + "Use ONLY the provided fact pack and recent chat as your evidence. " + "You may draw light inferences if you label them as such. " + "Write concise, human sentences, not a list. " + "If the question is subjective, share a light opinion grounded in facts. " + "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " + "Avoid repeating the exact same observation as the last response if possible. " + "Do not invent numbers or facts. " + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100)." 
+ ) + + +def _candidate_note(candidate: dict[str, Any]) -> str: + claim = str(candidate.get("claim") or candidate.get("summary") or "") + return claim[:160] + ("…" if len(claim) > 160 else "") + + +def _ensure_scores(answer: str) -> str: + text = answer.strip() + lines = [line for line in text.splitlines() if line.strip()] + has_relevance = any(line.lower().startswith("relevance:") for line in lines) + has_satisfaction = any(line.lower().startswith("satisfaction:") for line in lines) + has_confidence = any("confidence:" in line.lower() for line in lines) + if not has_confidence: + lines.append("Confidence: medium") + if not has_relevance: + lines.append("Relevance: 70") + if not has_satisfaction: + lines.append("Satisfaction: 70") + return "\n".join(lines) + + +def _open_ended_fast( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + if state: + state.update("synthesizing", step=2) + synthesis_prompt = ( + "You are given a question and a fact pack. " + "Answer in 2-4 sentences, using only facts from the pack. " + "Pick one or two facts that best fit the question and explain why they matter. " + "If the question is subjective, add a light opinion grounded in those facts. " + "Do not list raw facts; speak naturally. " + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" + f"Question: {prompt}" + ) + context = _append_history_context(fact_pack, history_lines) + reply = _ollama_call( + ("fast", "open"), + synthesis_prompt, + context=context, + use_history=False, + system_override=_open_ended_system(), + ) + return _ensure_scores(reply) + + +def _interpret_open_question( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], +) -> dict[str, Any]: + prompt_text = ( + "Analyze the question against the fact pack. " + "Return JSON: {\"focus\":\"...\",\"preference\":\"balanced|novelty|utilization|stability|risk\"," + "\"notes\":\"...\"}. " + "Use only the fact pack." + ) + context = _append_history_context(fact_pack, history_lines) + analysis = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + if not isinstance(analysis, dict): + return {"focus": "cluster snapshot", "preference": "balanced", "notes": ""} + preference = analysis.get("preference") or "balanced" + if preference not in ("balanced", "novelty", "utilization", "stability", "risk"): + preference = "balanced" + analysis["preference"] = preference + analysis.setdefault("focus", "cluster snapshot") + analysis.setdefault("notes", "") + return analysis + + +def _select_insights( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState, +) -> list[dict[str, Any]]: + insight_prompt = ( + "From the fact pack, select 3-5 candidate insights that could answer the question. " + "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," + "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\"}]}. " + "Use only the fact pack." 
+ ) + state.update("drafting candidates", step=2) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call(insight_prompt + f" Question: {prompt}", context=context) + insights = result.get("insights") if isinstance(result, dict) else None + if not isinstance(insights, list): + return [] + cleaned: list[dict[str, Any]] = [] + for item in insights: + if not isinstance(item, dict): + continue + if not item.get("summary") or not item.get("fact_ids"): + continue + cleaned.append(item) + state.update("drafting candidates", step=2, note=_candidate_note(item)) + return cleaned + + +def _score_insight(insight: dict[str, Any], preference: str) -> float: + relevance = insight.get("relevance") if isinstance(insight.get("relevance"), (int, float)) else 0.0 + novelty = insight.get("novelty") if isinstance(insight.get("novelty"), (int, float)) else 0.0 + if preference == "novelty": + return 0.4 * relevance + 0.6 * novelty + if preference == "utilization": + return 0.7 * relevance + 0.3 * novelty + if preference == "stability": + return 0.7 * relevance + 0.3 * novelty + if preference == "risk": + return 0.6 * relevance + 0.4 * novelty + return 0.6 * relevance + 0.4 * novelty + + +def _open_ended_deep( + prompt: str, + *, + fact_pack: str, + fact_ids: set[str], + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + state = state or ThoughtState() + if not fact_ids: + return _ensure_scores("I don't have enough data to answer that.") + state.total_steps = 6 + state.update("planning", step=1) + analysis = _interpret_open_question(prompt, fact_pack=fact_pack, history_lines=history_lines) + state.update("planning", step=1, note=str(analysis.get("focus") or "")) + + candidates = _select_insights(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state) + state.update("verifying", step=3) + filtered: list[dict[str, Any]] = [] + for cand in candidates: + cites = cand.get("fact_ids") if isinstance(cand.get("fact_ids"), list) else [] + if cites and not all(cite in fact_ids for cite in cites): + continue + filtered.append(cand) + if not filtered: + filtered = candidates + + preference = analysis.get("preference", "balanced") + ranked = sorted(filtered, key=lambda item: _score_insight(item, preference), reverse=True) + top = ranked[:2] + state.update("synthesizing", step=4) + synth_prompt = ( + "Use the question, fact pack, and selected insights to craft a concise answer. " + "Write 2-4 sentences. Explain why the selected insights stand out. " + "If the question is subjective, include a light opinion grounded in facts. " + "Avoid repeating the same observation as the last response if possible. 
" + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" + f"Question: {prompt}\n" + f"Interpretation: {json.dumps(analysis, ensure_ascii=False)}\n" + f"Selected: {json.dumps(top, ensure_ascii=False)}" + ) + context = _append_history_context(fact_pack, history_lines) + reply = _ollama_call( + ("deep", "open"), + synth_prompt, + context=context, + use_history=False, + system_override=_open_ended_system(), + ) + state.update("done", step=6) + return _ensure_scores(reply) + + +def open_ended_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]], + history_lines: list[str], + mode: str, + state: ThoughtState | None = None, +) -> str: + lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if not lines: + return _ensure_scores("I don't have enough data to answer that.") + fact_pack = _fact_pack_text(lines) + fact_ids = {f"F{i+1}" for i in range(len(lines))} + if mode == "fast": + return _open_ended_fast(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state) + return _open_ended_deep(prompt, fact_pack=fact_pack, fact_ids=fact_ids, history_lines=history_lines, state=state) + + +def _non_cluster_reply(prompt: str) -> str: + return _ensure_scores( + "I focus on the Atlas/Othrys cluster and don't have enough data to answer that." + ) + + # Internal HTTP endpoint for cluster answers (website uses this). class _AtlasbotHandler(BaseHTTPRequestHandler): server_version = "AtlasbotHTTP/1.0" @@ -2466,6 +2773,9 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): self._write_json(400, {"error": "missing_prompt"}) return cleaned = _strip_bot_mention(prompt) + mode = str(payload.get("mode") or "fast").lower() + if mode not in ("fast", "deep"): + mode = "fast" snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) @@ -2491,34 +2801,30 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): ) fallback = "I don't have enough data to answer that." 
if cluster_query: - facts_answer = cluster_answer( - cleaned, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history_lines, - ) open_ended = _is_subjective_query(cleaned) or _knowledge_intent(cleaned) if open_ended: - llm_context = _append_history_context(context, history_lines) - answer = ollama_reply( - ("http", "internal"), + answer = open_ended_answer( cleaned, - context=llm_context, - fallback=facts_answer or fallback, - use_history=False, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + state=None, ) else: - answer = facts_answer or fallback + answer = ( + cluster_answer( + cleaned, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + ) + or fallback + ) else: - llm_prompt = cleaned - answer = ollama_reply( - ("http", "internal"), - llm_prompt, - context=context, - fallback=fallback, - use_history=False, - ) + answer = _non_cluster_reply(cleaned) self._write_json(200, {"answer": answer}) @@ -2760,8 +3066,15 @@ def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: summary = "\n".join(parts).strip() return _format_confidence(summary, "medium") if summary else "" -def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = True) -> str: - system = ( +def _ollama_call( + hist_key, + prompt: str, + *, + context: str, + use_history: bool = True, + system_override: str | None = None, +) -> str: + system = system_override or ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " "Use the provided context and facts as your source of truth. " @@ -2877,6 +3190,47 @@ def ollama_reply_with_thinking( thread.join(timeout=1) return result["reply"] or fallback or "Model backend is busy. Try again in a moment." + +def open_ended_with_thinking( + token: str, + room: str, + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]], + history_lines: list[str], + mode: str, +) -> str: + result: dict[str, str] = {"reply": ""} + done = threading.Event() + total_steps = 2 if mode == "fast" else 6 + state = ThoughtState(total_steps=total_steps) + + def worker(): + result["reply"] = open_ended_answer( + prompt, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + state=state, + ) + done.set() + + thread = threading.Thread(target=worker, daemon=True) + thread.start() + if not done.wait(2.0): + send_msg(token, room, "Thinking…") + heartbeat = max(10, THINKING_INTERVAL_SEC) + next_heartbeat = time.monotonic() + heartbeat + while not done.wait(max(0, next_heartbeat - time.monotonic())): + send_msg(token, room, state.status_line()) + next_heartbeat += heartbeat + thread.join(timeout=1) + return result["reply"] or "Model backend is busy. Try again in a moment." + def sync_loop(token: str, room_id: str): since = None try: @@ -2931,6 +3285,7 @@ def sync_loop(token: str, room_id: str): cleaned_body = _strip_bot_mention(body) lower_body = cleaned_body.lower() + mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep") # Only do live cluster introspection in DMs. allow_tools = is_dm @@ -2984,39 +3339,34 @@ def sync_loop(token: str, room_id: str): fallback = "I don't have enough data to answer that." 
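# Heartbeat sketch: while open_ended_answer runs in the worker thread, the
# room sees ThoughtState.status_line() roughly every THINKING_INTERVAL_SEC
# (floored at 10s), e.g.:
#   state = ThoughtState(total_steps=6)
#   state.update("planning", step=1, note="cluster snapshot")
#   state.status_line()
#   -> "Still thinking (Stage 1/6: planning). Latest insight: cluster snapshot"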
if cluster_query: - facts_answer = cluster_answer( - cleaned_body, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history[hist_key], - ) open_ended = _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body) if open_ended: - llm_context = _append_history_context(context, history[hist_key]) - reply = ollama_reply_with_thinking( + reply = open_ended_with_thinking( token, rid, - hist_key, cleaned_body, - context=llm_context, - fallback=facts_answer or fallback, - use_history=False, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", ) else: - reply = facts_answer or fallback + reply = ( + cluster_answer( + cleaned_body, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history[hist_key], + ) + or fallback + ) else: - llm_prompt = cleaned_body - reply = ollama_reply_with_thinking( - token, - rid, - hist_key, - llm_prompt, - context=context, - fallback=fallback, - use_history=False, - ) + reply = _non_cluster_reply(cleaned_body) send_msg(token, rid, reply) + history[hist_key].append(f"Atlas: {reply}") + history[hist_key] = history[hist_key][-80:] def login_with_retry(): last_err = None From e486245aaf0fb4f29128e1ebcf3a32d827e3145d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:09:48 -0300 Subject: [PATCH 359/416] atlasbot: guard open-ended LLM calls --- services/comms/scripts/atlasbot/bot.py | 28 ++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index aa7e614..47458ea 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2530,6 +2530,26 @@ def _open_ended_system() -> str: ) +def _ollama_call_safe( + hist_key, + prompt: str, + *, + context: str, + fallback: str, + system_override: str | None = None, +) -> str: + try: + return _ollama_call( + hist_key, + prompt, + context=context, + use_history=False, + system_override=system_override, + ) + except Exception: + return fallback + + def _candidate_note(candidate: dict[str, Any]) -> str: claim = str(candidate.get("claim") or candidate.get("summary") or "") return claim[:160] + ("…" if len(claim) > 160 else "") @@ -2569,11 +2589,11 @@ def _open_ended_fast( f"Question: {prompt}" ) context = _append_history_context(fact_pack, history_lines) - reply = _ollama_call( + reply = _ollama_call_safe( ("fast", "open"), synthesis_prompt, context=context, - use_history=False, + fallback="I don't have enough data to answer that.", system_override=_open_ended_system(), ) return _ensure_scores(reply) @@ -2690,11 +2710,11 @@ def _open_ended_deep( f"Selected: {json.dumps(top, ensure_ascii=False)}" ) context = _append_history_context(fact_pack, history_lines) - reply = _ollama_call( + reply = _ollama_call_safe( ("deep", "open"), synth_prompt, context=context, - use_history=False, + fallback="I don't have enough data to answer that.", system_override=_open_ended_system(), ) state.update("done", step=6) From 65e50d1923c53ef459c723c1c07d3342f384358e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:11:58 -0300 Subject: [PATCH 360/416] atlasbot: bump rollout checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index cc628dd..97567eb 100644 --- 
a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-70 + checksum/atlasbot-configmap: manual-atlasbot-71 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 27e8a770448033a5e03227eee68206911b17eed3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:16:47 -0300 Subject: [PATCH 361/416] atlasbot: add model fallback and rollout --- services/comms/atlasbot-deployment.yaml | 4 +++- services/comms/scripts/atlasbot/bot.py | 24 +++++++++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 97567eb..7414f1e 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-71 + checksum/atlasbot-configmap: manual-atlasbot-72 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -83,6 +83,8 @@ spec: value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL value: qwen2.5:14b-instruct + - name: OLLAMA_FALLBACK_MODEL + value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC value: "600" - name: ATLASBOT_THINKING_INTERVAL_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 47458ea..2c93b75 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -17,6 +17,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") +FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") API_KEY = os.environ.get("CHAT_API_KEY", "") OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090")) @@ -3133,14 +3134,23 @@ def _ollama_call( if lock: lock.acquire() try: - with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: - data = json.loads(resp.read().decode()) - msg = data.get("message") if isinstance(data, dict) else None - if isinstance(msg, dict): - raw_reply = msg.get("content") + try: + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + except error.HTTPError as exc: + if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]: + payload["model"] = FALLBACK_MODEL + r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) else: - raw_reply = data.get("response") or data.get("reply") or data - reply = _normalize_reply(raw_reply) or "I'm here to help." + raise + msg = data.get("message") if isinstance(data, dict) else None + if isinstance(msg, dict): + raw_reply = msg.get("content") + else: + raw_reply = data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." 
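# The 404 retry above assumes Ollama's /api/chat signals an unknown model
# tag with HTTP 404, and that urllib.error is imported alongside
# urllib.request at the top of bot.py (not shown in this hunk). With the
# env from this patch, qwen2.5:14b-instruct falls back to the already
# quantized qwen2.5:14b-instruct-q4_0 instead of surfacing an error to
# the room.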
if use_history: history[hist_key].append(f"Atlas: {reply}") return reply From 832d5acf6847512986b0fc896ef17050df1fde19 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:27:19 -0300 Subject: [PATCH 362/416] atlasbot: improve metric parsing and cluster intent --- services/comms/scripts/atlasbot/bot.py | 48 +++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 2c93b75..b9bc0e6 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -190,6 +190,8 @@ _INSIGHT_HINT_WORDS = { "surprising", "weird", "odd", + "unusual", + "outlier", "fun", "cool", "unique", @@ -540,6 +542,13 @@ def _detect_operation(q: str) -> str | None: def _detect_metric(q: str) -> str | None: tokens = set(_tokens(q)) + expanded: set[str] = set(tokens) + for token in list(tokens): + for part in re.split(r"[-_]", token): + part = part.strip() + if len(part) >= 2: + expanded.add(part) + tokens = expanded for metric, phrases in METRIC_HINTS.items(): for phrase in phrases: if " " in phrase: @@ -1271,6 +1280,19 @@ def snapshot_metric_answer( pending = metrics.get("pods_pending") failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") + status_terms = ("running", "pending", "failed", "succeeded", "completed") + if sum(1 for term in status_terms if term in q) > 1: + parts = [] + if running is not None: + parts.append(f"running {running:.0f}") + if pending is not None: + parts.append(f"pending {pending:.0f}") + if failed is not None: + parts.append(f"failed {failed:.0f}") + if succeeded is not None: + parts.append(f"succeeded {succeeded:.0f}") + if parts: + return _format_confidence(f"Pods: {', '.join(parts)}.", "high") if "pending" in q and pending is not None: return _format_confidence(f"Pending pods: {pending:.0f}.", "high") if "failed" in q and failed is not None: @@ -1345,7 +1367,17 @@ def structured_answer( if hw_line: return _format_confidence(hw_line, "high") - if op == "top" and metric is None: + if entity == "node" and op == "status" and metric is None: + summary = _nodes_summary_line(inventory, snapshot) + if summary: + return _format_confidence(summary, "high") + + if entity == "node" and metric is None and any(word in q for word in ("hardware", "architecture", "class", "mix")): + hw_line = _hardware_mix_line(inventory) + if hw_line: + return _format_confidence(hw_line, "medium") + + if op == "top" and metric is None and not any(word in q for word in ("hardware", "architecture", "class")): metric = "cpu" # Metrics-first when a metric or top operation is requested. 
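# Token-split sketch: _detect_metric now also matches the split forms of
# hyphenated/underscored tokens (exact METRIC_HINTS phrases assumed from
# earlier in bot.py; depending on how _tokens handles punctuation the
# hyphen case may already have been covered, so the underscore split is
# the main win):
#   _detect_metric("top ram_usage by node")    # {"ram_usage","ram","usage"} -> "ram"
#   _detect_metric("hottest cpu-load node?")   # {"cpu","load"}              -> "cpu"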
@@ -2807,8 +2839,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory=inventory, workloads=workloads, ) - cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) or ( - _is_subjective_query(cleaned) and history_cluster + cluster_query = ( + _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + or history_cluster + or _knowledge_intent(cleaned) + or _is_subjective_query(cleaned) ) context = "" if cluster_query: @@ -3347,8 +3382,11 @@ def sync_loop(token: str, room_id: str): inventory=inventory, workloads=workloads, ) - cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) or ( - _is_subjective_query(cleaned_body) and history_cluster + cluster_query = ( + _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + or history_cluster + or _knowledge_intent(cleaned_body) + or _is_subjective_query(cleaned_body) ) context = "" if cluster_query: From 2fe763189d95c3a21d62e40b9fb5b7695ba0c3cf Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:27:52 -0300 Subject: [PATCH 363/416] atlasbot: roll pod after metric parsing update --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7414f1e..4e27b5a 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-72 + checksum/atlasbot-configmap: manual-atlasbot-73 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From ba16f5119bf5c3d80b58761e1e0300fe6e6bd381 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:43:37 -0300 Subject: [PATCH 364/416] monitoring: unify gpu namespace usage --- scripts/dashboards_render_atlas.py | 47 ++++++++++++++----- services/monitoring/dashboards/atlas-gpu.json | 6 +-- .../monitoring/dashboards/atlas-overview.json | 2 +- .../monitoring/grafana-dashboard-gpu.yaml | 6 +-- .../grafana-dashboard-overview.yaml | 2 +- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 445de94..2e5c73b 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -208,32 +208,53 @@ def namespace_ram_raw(scope_var): def namespace_gpu_usage_instant(scope_var): - dcgm = f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" - jetson = jetson_gpu_usage_by_namespace(scope_var) - merged = ( - f'label_replace({dcgm}, "source", "dcgm", "", "") ' - f'or label_replace({jetson}, "source", "jetson", "", "")' - ) - return f"sum by (namespace) ({merged})" + return gpu_usage_by_namespace(scope_var) def jetson_gpu_util_by_node(): return 'max by (node) (jetson_gr3d_freq_percent{node!=""})' -def jetson_gpu_util_by_hostname(): +def dcgm_gpu_util_by_node(): + dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")' + dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")' return ( - 'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), ' - '"Hostname", "$1", "node", "(.*)")' + "avg by (node) (" + f"{dcgm_ns} * on(namespace,pod) group_left(node) " + 'kube_pod_info{namespace="monitoring"}' + ")" ) -def 
jetson_gpu_requests(scope_var): +def gpu_util_by_node(): + return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}" + + +def gpu_util_by_hostname(): + return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")' + + +def gpu_node_labels(): + return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}' + + +def gpu_requests_by_namespace_node(scope_var): return ( "sum by (namespace,node) (" f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' "* on(namespace,pod) group_left(node) kube_pod_info " - '* on(node) group_left(label_jetson) kube_node_labels{label_jetson="true"}' + f"* on(node) group_left() {gpu_node_labels()}" + ")" + ) + + +def gpu_usage_by_namespace(scope_var): + requests_by_ns = gpu_requests_by_namespace_node(scope_var) + total_by_node = f"sum by (node) ({requests_by_ns})" + return ( + "sum by (namespace) (" + f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " + f"* on(node) group_left() {gpu_util_by_node()}" ")" ) @@ -2695,7 +2716,7 @@ def build_gpu_dashboard(): timeseries_panel( 3, "GPU Util by Node", - f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})', + gpu_util_by_hostname(), {"h": 8, "w": 12, "x": 0, "y": 8}, unit="percent", legend="{{Hostname}}", diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 132f276..8542c5e 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", 
\"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -126,7 +126,7 @@ }, "targets": [ { - "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", + "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index b212c8c..31b7867 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 55f63e8..8d3a3dd 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * 
on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) 
kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -135,7 +135,7 @@ data: }, "targets": [ { - "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", + "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index a899002..2a7cc2b 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by 
(namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * 
on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } From 38c8d08ab485cc21976fef526145b20db71c2b3a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:46:58 -0300 Subject: [PATCH 365/416] monitoring: fix gpu idle label --- scripts/dashboards_render_atlas.py | 4 ++-- services/monitoring/dashboards/atlas-gpu.json | 4 ++-- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/grafana-dashboard-gpu.yaml | 4 ++-- services/monitoring/grafana-dashboard-overview.yaml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 2e5c73b..5db798d 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -243,7 +243,7 @@ def gpu_requests_by_namespace_node(scope_var): "sum by (namespace,node) (" f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' "* on(namespace,pod) group_left(node) kube_pod_info " - f"* on(node) group_left() {gpu_node_labels()}" + f"* on(node) group_left() ({gpu_node_labels()})" ")" ) @@ -254,7 +254,7 @@ def gpu_usage_by_namespace(scope_var): return ( "sum by (namespace) (" f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " - f"* on(node) group_left() {gpu_util_by_node()}" + f"* on(node) group_left() ({gpu_util_by_node()})" ")" ) diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 8542c5e..6f993d9 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() 
(kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 31b7867..1f8635b 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) 
group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml 
b/services/monitoring/grafana-dashboard-gpu.yaml index 8d3a3dd..3407963 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by 
(node) (jetson_gr3d_freq_percent{node!=\"\"}))", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 2a7cc2b..fdfe1a7 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() 
kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) 
group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
            "refId": "A",
            "legendFormat": "{{namespace}}"
          }
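Note on patch 365: PromQL gives `or` the lowest operator precedence, so `requests * on(node) group_left() A or B` parses as `(requests * A) or B`, and the unmatched label series leaked past the multiplication, which is what mislabeled the idle row. A minimal Python sketch of the corrected helper composition, assuming gpu_node_labels() expands to the accelerator/jetson union visible in the rendered JSON above:

# Sketch only; helper names follow scripts/dashboards_render_atlas.py.
def gpu_node_labels() -> str:
    # Union of GPU-capable nodes, as rendered in the dashboards above.
    return (
        'kube_node_labels{label_accelerator=~".+"} '
        'or kube_node_labels{label_jetson="true"}'
    )

def gpu_requests_by_namespace_node(scope_var: str) -> str:
    # The added parentheses keep the whole union on the right side of `*`;
    # without them, `or` binds last and the raw union escaped the product.
    return (
        "sum by (namespace,node) ("
        f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
        "* on(namespace,pod) group_left(node) kube_pod_info "
        f"* on(node) group_left() ({gpu_node_labels()})"
        ")"
    )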
"pods", + "utilization", + "workloads", + "workers", +} + +_DYNAMIC_TAGS = {"availability", "database", "pods", "utilization", "workloads"} +_INVENTORY_TAGS = {"hardware", "architecture", "inventory", "workers", "node_detail", "os"} + + +def _fact_line_tags(line: str) -> set[str]: + text = (line or "").lower() + tags: set[str] = set() + if any(key in text for key in ("nodes_total", "ready", "not_ready", "workers_ready", "workers_not_ready")): + tags.add("availability") + if "nodes_by_arch" in text or "arch " in text or "architecture" in text: + tags.add("architecture") + if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")): + tags.update({"hardware", "inventory"}) + if "control_plane_nodes" in text or "worker_nodes" in text: + tags.add("inventory") + if any(key in text for key in ("hottest_", "node_usage", "cpu=", "ram=", "net=", "io=")): + tags.add("utilization") + if "postgres_" in text or "postgres connections" in text: + tags.add("database") + if "pods_" in text or "pod phases" in text: + tags.add("pods") + if "workloads" in text or "primary_node" in text: + tags.add("workloads") + if "node_details" in text: + tags.add("node_detail") + if "os mix" in text or "os" in text: + tags.add("os") + return tags & _ALLOWED_INSIGHT_TAGS + + +def _fact_pack_meta(lines: list[str]) -> dict[str, dict[str, Any]]: + meta: dict[str, dict[str, Any]] = {} + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = sorted(_fact_line_tags(line)) + meta[fid] = {"tags": tags} + return meta + + +def _history_tags(history_lines: list[str]) -> set[str]: + tags: set[str] = set() + for line in history_lines[-6:]: + tags.update(_fact_line_tags(line)) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _seed_insights( + lines: list[str], + fact_meta: dict[str, dict[str, Any]], + *, + limit: int = 6, +) -> list[dict[str, Any]]: + priority = [ + "utilization", + "database", + "pods", + "workloads", + "availability", + "hardware", + "architecture", + "inventory", + ] + seeds: list[dict[str, Any]] = [] + used_tags: set[str] = set() + for tag in priority: + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = set(fact_meta.get(fid, {}).get("tags") or []) + if tag not in tags or fid in {s["fact_ids"][0] for s in seeds}: + continue + summary = line.lstrip("- ").strip() + seeds.append( + { + "summary": summary, + "fact_ids": [fid], + "relevance": 0.5, + "novelty": 0.5, + "rationale": "seeded from fact pack", + "tags": sorted(tags), + } + ) + used_tags.update(tags) + if len(seeds) >= limit: + return seeds + return seeds + + +def _insight_tags(insight: dict[str, Any], fact_meta: dict[str, dict[str, Any]]) -> set[str]: + tags: set[str] = set() + for fid in insight.get("fact_ids") if isinstance(insight.get("fact_ids"), list) else []: + tags.update(fact_meta.get(fid, {}).get("tags") or []) + raw_tags = insight.get("tags") if isinstance(insight.get("tags"), list) else [] + tags.update(t for t in raw_tags if isinstance(t, str)) + summary = insight.get("summary") or insight.get("claim") or "" + if isinstance(summary, str): + tags.update(_fact_line_tags(summary)) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _insight_score( + insight: dict[str, Any], + *, + preference: str, + prefer_tags: set[str], + avoid_tags: set[str], + history_tags: set[str], + fact_meta: dict[str, dict[str, Any]], +) -> float: + base = _score_insight(insight, preference) + tags = _insight_tags(insight, fact_meta) + if prefer_tags and tags: + base += 0.15 * len(tags & prefer_tags) + if avoid_tags and tags: + 
@@ -2541,6 +2578,12 @@
         if not trimmed or trimmed.lower().startswith("facts"):
             continue
         lines.append(trimmed)
+    if _knowledge_intent(prompt) or _doc_intent(prompt) or _is_overview_query(prompt):
+        kb_titles = kb_retrieve_titles(prompt, limit=4)
+        if kb_titles:
+            for kb_line in kb_titles.splitlines():
+                if kb_line.strip():
+                    lines.append(kb_line.strip())
     return lines
 
 
@@ -2549,12 +2592,194 @@
 def _fact_pack_text(lines: list[str]) -> str:
     return "Fact pack:\n" + "\n".join(labeled)
 
 
+_ALLOWED_INSIGHT_TAGS = {
+    "availability",
+    "architecture",
+    "database",
+    "hardware",
+    "inventory",
+    "node_detail",
+    "os",
+    "pods",
+    "utilization",
+    "workloads",
+    "workers",
+}
+
+_DYNAMIC_TAGS = {"availability", "database", "pods", "utilization", "workloads"}
+_INVENTORY_TAGS = {"hardware", "architecture", "inventory", "workers", "node_detail", "os"}
+
+
+def _fact_line_tags(line: str) -> set[str]:
+    text = (line or "").lower()
+    tags: set[str] = set()
+    if any(key in text for key in ("nodes_total", "ready", "not_ready", "workers_ready", "workers_not_ready")):
+        tags.add("availability")
+    if "nodes_by_arch" in text or "arch " in text or "architecture" in text:
+        tags.add("architecture")
+    if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")):
+        tags.update({"hardware", "inventory"})
+    if "control_plane_nodes" in text or "worker_nodes" in text:
+        tags.add("inventory")
+    if any(key in text for key in ("hottest_", "node_usage", "cpu=", "ram=", "net=", "io=")):
+        tags.add("utilization")
+    if "postgres_" in text or "postgres connections" in text:
+        tags.add("database")
+    if "pods_" in text or "pod phases" in text:
+        tags.add("pods")
+    if "workloads" in text or "primary_node" in text:
+        tags.add("workloads")
+    if "node_details" in text:
+        tags.add("node_detail")
+    # Whole-token check: a bare substring "os" would also match "postgres",
+    # "nodes", and "hostname".
+    if "os mix" in text or "os" in text.split():
+        tags.add("os")
+    return tags & _ALLOWED_INSIGHT_TAGS
+
+
+def _fact_pack_meta(lines: list[str]) -> dict[str, dict[str, Any]]:
+    meta: dict[str, dict[str, Any]] = {}
+    for idx, line in enumerate(lines):
+        fid = f"F{idx + 1}"
+        meta[fid] = {"tags": sorted(_fact_line_tags(line))}
+    return meta
+
+
+def _history_tags(history_lines: list[str]) -> set[str]:
+    tags: set[str] = set()
+    for line in history_lines[-6:]:
+        tags.update(_fact_line_tags(line))
+    return tags & _ALLOWED_INSIGHT_TAGS
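Worked example for the tagger on an invented fact-pack line; with the whole-token check, strings like "postgres" or "nodes" no longer pick up a spurious os tag:

# Illustrative only; not part of the patch.
line = "- hottest_node cpu=81% ram=64% (titan-21, arm64)"
sorted(_fact_line_tags(line))  # -> ['hardware', 'inventory', 'utilization']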
" + "If the question is subjective, add a light opinion grounded in facts. " + "Avoid repeating the exact same observation as the most recent response if possible. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" - f"Question: {prompt}" + f"Question: {prompt}\n" + f"Selected: {json.dumps(selected, ensure_ascii=False)}" ) context = _append_history_context(fact_pack, history_lines) reply = _ollama_call_safe( @@ -2637,23 +2896,36 @@ def _interpret_open_question( *, fact_pack: str, history_lines: list[str], + tags_available: set[str], + avoid_tags: set[str], + state: ThoughtState | None = None, ) -> dict[str, Any]: + tags_list = ", ".join(sorted(tags_available)) if tags_available else "none" + avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" prompt_text = ( "Analyze the question against the fact pack. " "Return JSON: {\"focus\":\"...\",\"preference\":\"balanced|novelty|utilization|stability|risk\"," - "\"notes\":\"...\"}. " + "\"tags\":[\"...\"] ,\"notes\":\"...\"}. " + "If the question implies interesting/unique/unconventional/cool, set preference to novelty " + "and prefer dynamic tags (utilization/pods/database/availability) when possible. " + f"Use only these tags if relevant: {tags_list}. Avoid tags: {avoid_list}. " "Use only the fact pack." ) context = _append_history_context(fact_pack, history_lines) analysis = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) if not isinstance(analysis, dict): - return {"focus": "cluster snapshot", "preference": "balanced", "notes": ""} + analysis = {"focus": "cluster snapshot", "preference": "balanced", "notes": "", "tags": []} preference = analysis.get("preference") or "balanced" if preference not in ("balanced", "novelty", "utilization", "stability", "risk"): preference = "balanced" analysis["preference"] = preference analysis.setdefault("focus", "cluster snapshot") analysis.setdefault("notes", "") + tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else [] + clean_tags = {t for t in tags if isinstance(t, str)} + analysis["tags"] = sorted(clean_tags & tags_available) + if state: + state.update("planning", step=1, note=str(analysis.get("focus") or "")) return analysis @@ -2663,27 +2935,41 @@ def _select_insights( fact_pack: str, history_lines: list[str], state: ThoughtState, + analysis: dict[str, Any], + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + avoid_tags: set[str], ) -> list[dict[str, Any]]: + preferred_tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else [] + prefer_list = ", ".join(sorted({t for t in preferred_tags if isinstance(t, str)})) + avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" + available_list = ", ".join(sorted({t for t in _ALLOWED_INSIGHT_TAGS})) insight_prompt = ( "From the fact pack, select 3-5 candidate insights that could answer the question. " "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," - "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\"}]}. " - "Use only the fact pack." + "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\",\"tags\":[\"...\"]}]}. " + f"Available tags: {available_list}. Prefer tags: {prefer_list or 'none'}. Avoid tags: {avoid_list}. " + "Use only the fact pack and provided tags." 
) state.update("drafting candidates", step=2) context = _append_history_context(fact_pack, history_lines) result = _ollama_json_call(insight_prompt + f" Question: {prompt}", context=context) insights = result.get("insights") if isinstance(result, dict) else None if not isinstance(insights, list): - return [] + insights = [] cleaned: list[dict[str, Any]] = [] for item in insights: if not isinstance(item, dict): continue if not item.get("summary") or not item.get("fact_ids"): continue + tags = _insight_tags(item, fact_meta) + item["tags"] = sorted(tags) cleaned.append(item) state.update("drafting candidates", step=2, note=_candidate_note(item)) + seeds = _seed_insights(fact_lines, fact_meta) + for seed in seeds: + cleaned.append(seed) return cleaned @@ -2707,18 +2993,36 @@ def _open_ended_deep( fact_pack: str, fact_ids: set[str], history_lines: list[str], + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + tags_available: set[str], + history_tags: set[str], state: ThoughtState | None = None, ) -> str: state = state or ThoughtState() if not fact_ids: return _ensure_scores("I don't have enough data to answer that.") - state.total_steps = 6 - state.update("planning", step=1) - analysis = _interpret_open_question(prompt, fact_pack=fact_pack, history_lines=history_lines) - state.update("planning", step=1, note=str(analysis.get("focus") or "")) + state.total_steps = 7 + analysis = _interpret_open_question( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + tags_available=tags_available, + avoid_tags=history_tags, + state=state, + ) - candidates = _select_insights(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state) - state.update("verifying", step=3) + candidates = _select_insights( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + state=state, + analysis=analysis, + fact_lines=fact_lines, + fact_meta=fact_meta, + avoid_tags=history_tags, + ) + state.update("verifying", step=3, note="scoring insights") filtered: list[dict[str, Any]] = [] for cand in candidates: cites = cand.get("fact_ids") if isinstance(cand.get("fact_ids"), list) else [] @@ -2729,9 +3033,17 @@ def _open_ended_deep( filtered = candidates preference = analysis.get("preference", "balanced") - ranked = sorted(filtered, key=lambda item: _score_insight(item, preference), reverse=True) - top = ranked[:2] - state.update("synthesizing", step=4) + prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)} + top = _select_diverse_insights( + filtered, + preference=preference, + prefer_tags=prefer_tags, + avoid_tags=history_tags, + history_tags=history_tags, + fact_meta=fact_meta, + count=2, + ) + state.update("synthesizing", step=4, note="composing response") synth_prompt = ( "Use the question, fact pack, and selected insights to craft a concise answer. " "Write 2-4 sentences. Explain why the selected insights stand out. 
" @@ -2740,6 +3052,7 @@ def _open_ended_deep( "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" f"Question: {prompt}\n" f"Interpretation: {json.dumps(analysis, ensure_ascii=False)}\n" + f"Recent tags: {', '.join(sorted(history_tags)) if history_tags else 'none'}\n" f"Selected: {json.dumps(top, ensure_ascii=False)}" ) context = _append_history_context(fact_pack, history_lines) @@ -2750,7 +3063,7 @@ def _open_ended_deep( fallback="I don't have enough data to answer that.", system_override=_open_ended_system(), ) - state.update("done", step=6) + state.update("done", step=7) return _ensure_scores(reply) @@ -2769,9 +3082,31 @@ def open_ended_answer( return _ensure_scores("I don't have enough data to answer that.") fact_pack = _fact_pack_text(lines) fact_ids = {f"F{i+1}" for i in range(len(lines))} + fact_meta = _fact_pack_meta(lines) + tags_available = {tag for entry in fact_meta.values() for tag in entry.get("tags", [])} + history_tags = _history_tags(history_lines) if mode == "fast": - return _open_ended_fast(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state) - return _open_ended_deep(prompt, fact_pack=fact_pack, fact_ids=fact_ids, history_lines=history_lines, state=state) + return _open_ended_fast( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + fact_lines=lines, + fact_meta=fact_meta, + tags_available=tags_available, + history_tags=history_tags, + state=state, + ) + return _open_ended_deep( + prompt, + fact_pack=fact_pack, + fact_ids=fact_ids, + history_lines=history_lines, + fact_lines=lines, + fact_meta=fact_meta, + tags_available=tags_available, + history_tags=history_tags, + state=state, + ) def _non_cluster_reply(prompt: str) -> str: @@ -2826,9 +3161,9 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): self._write_json(400, {"error": "missing_prompt"}) return cleaned = _strip_bot_mention(prompt) - mode = str(payload.get("mode") or "fast").lower() + mode = str(payload.get("mode") or "deep").lower() if mode not in ("fast", "deep"): - mode = "fast" + mode = "deep" snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) @@ -2839,11 +3174,12 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory=inventory, workloads=workloads, ) + followup = _is_followup_query(cleaned) cluster_query = ( _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) - or history_cluster or _knowledge_intent(cleaned) or _is_subjective_query(cleaned) + or (history_cluster and followup) ) context = "" if cluster_query: @@ -2857,7 +3193,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): ) fallback = "I don't have enough data to answer that." 
if cluster_query: - open_ended = _is_subjective_query(cleaned) or _knowledge_intent(cleaned) + open_ended = ( + _is_subjective_query(cleaned) + or _knowledge_intent(cleaned) + or _is_overview_query(cleaned) + ) if open_ended: answer = open_ended_answer( cleaned, @@ -3068,7 +3408,6 @@ def _knowledge_intent(prompt: str) -> bool: "summary", "describe", "explain", - "what is", ) ) @@ -3269,7 +3608,7 @@ def open_ended_with_thinking( ) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() - total_steps = 2 if mode == "fast" else 6 + total_steps = 4 if mode == "fast" else 7 state = ThoughtState(total_steps=total_steps) def worker(): @@ -3382,11 +3721,12 @@ def sync_loop(token: str, room_id: str): inventory=inventory, workloads=workloads, ) + followup = _is_followup_query(cleaned_body) cluster_query = ( _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) - or history_cluster or _knowledge_intent(cleaned_body) or _is_subjective_query(cleaned_body) + or (history_cluster and followup) ) context = "" if cluster_query: @@ -3407,7 +3747,11 @@ def sync_loop(token: str, room_id: str): fallback = "I don't have enough data to answer that." if cluster_query: - open_ended = _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body) + open_ended = ( + _is_subjective_query(cleaned_body) + or _knowledge_intent(cleaned_body) + or _is_overview_query(cleaned_body) + ) if open_ended: reply = open_ended_with_thinking( token, From 029e4d4ca6f06e65dc63bf0300c244e10fc276ba Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:00:19 -0300 Subject: [PATCH 367/416] monitoring: send grafana alerts via postmark --- services/monitoring/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 8e225d4..6185e59 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -339,7 +339,7 @@ spec: GF_AUTH_ANONYMOUS_ORG_NAME: "Overview" GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" GF_SMTP_ENABLED: "true" - GF_SMTP_HOST: "mail.bstein.dev:587" + GF_SMTP_HOST: "smtp.postmarkapp.com:587" GF_SMTP_FROM: "no-reply-grafana@bstein.dev" GF_SMTP_FROM_NAME: "Atlas Grafana" GRAFANA_ALERT_EMAILS: "brad@bstein.dev" From 868075426c953dc711f7bb78f4a6327a32300c40 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:22:50 -0300 Subject: [PATCH 368/416] atlasbot: overhaul open-ended reasoning --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 697 +++++++++--------------- 2 files changed, 253 insertions(+), 446 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 5e5bc05..17e2cb2 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-74 + checksum/atlasbot-configmap: manual-atlasbot-75 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 0176293..0668521 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -198,6 +198,8 @@ _INSIGHT_HINT_WORDS = { "unique", "notable", "coolest", + "risk", + "risky", "favorite", "favourite", "trivia", @@ 
-1641,17 +1643,6 @@ def _hottest_summary_line(metrics: dict[str, Any]) -> str: return "Hot spots: " + "; ".join(parts) + "." -def _is_insight_query(query: str) -> bool: - q = normalize_query(query) - if not q: - return False - if any(word in q for word in _INSIGHT_HINT_WORDS): - return True - if "most" in q and any(word in q for word in ("unusual", "odd", "weird", "unconventional")): - return True - return False - - _FOLLOWUP_HINTS = ( "what about", "how about", @@ -1724,198 +1715,6 @@ def _doc_intent(query: str) -> bool: ) -def _insight_candidates( - inventory: list[dict[str, Any]], - snapshot: dict[str, Any] | None, -) -> list[tuple[str, str, str]]: - metrics = _snapshot_metrics(snapshot) - candidates: list[tuple[str, str, str]] = [] - - nodes_line = _nodes_summary_line(inventory, snapshot) - if nodes_line and "not ready" in nodes_line.lower(): - candidates.append(("availability", nodes_line, "high")) - - hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} - if hottest: - def _hot_node(entry: dict[str, Any]) -> str: - if not isinstance(entry, dict): - return "" - return ( - entry.get("node") - or entry.get("label") - or (entry.get("metric") or {}).get("node") - or "" - ) - - cpu = hottest.get("cpu") if isinstance(hottest.get("cpu"), dict) else {} - cpu_node = _hot_node(cpu) - if cpu_node and cpu.get("value") is not None: - value_fmt = _format_metric_value(str(cpu.get("value")), percent=True) - candidates.append(("cpu", f"The busiest CPU right now is {cpu_node} at about {value_fmt}.", "high")) - ram = hottest.get("ram") if isinstance(hottest.get("ram"), dict) else {} - ram_node = _hot_node(ram) - if ram_node and ram.get("value") is not None: - value_fmt = _format_metric_value(str(ram.get("value")), percent=True) - candidates.append(("ram", f"RAM usage peaks on {ram_node} at about {value_fmt}.", "high")) - - postgres_line = _postgres_summary_line(metrics) - if postgres_line: - candidates.append(("postgres", postgres_line, "high")) - - hardware_insight = _hardware_insight(inventory) - if hardware_insight: - candidates.append(("hardware", hardware_insight, "medium")) - - pods_line = _pods_summary_line(metrics) - if pods_line: - candidates.append(("pods", pods_line, "high")) - - return candidates - - -def _hardware_insight(inventory: list[dict[str, Any]]) -> str: - if not inventory: - return "" - groups = _group_nodes(inventory) - jetsons = groups.get("jetson") or [] - rpi5 = groups.get("rpi5") or [] - rpi4 = groups.get("rpi4") or [] - amd64 = groups.get("amd64") or [] - parts: list[str] = [] - if rpi5: - parts.append(f"rpi5={len(rpi5)}") - if rpi4: - parts.append(f"rpi4={len(rpi4)}") - if jetsons: - jetson_names = ", ".join(jetsons[:2]) - parts.append(f"jetson={len(jetsons)} ({jetson_names})") - if amd64: - parts.append(f"amd64={len(amd64)}") - return ", ".join(parts) - - -def _recent_insight_keys(history_lines: list[str]) -> set[str]: - used: set[str] = set() - for line in history_lines[-10:]: - lower = normalize_query(line) - if not lower: - continue - if "postgres" in lower or "connections" in lower: - used.add("postgres") - if "atlas mixes" in lower or "hardware" in lower or "rpi" in lower or "jetson" in lower: - used.add("hardware") - if "busiest cpu" in lower or "cpu right now" in lower or "cpu " in lower: - used.add("cpu") - if "ram usage" in lower or "memory" in lower: - used.add("ram") - if "pods" in lower: - used.add("pods") - if "not ready" in lower: - used.add("availability") - return used - - -def _select_insight( - prompt: str, - 
candidates: list[tuple[str, str, str]], - *, - used_keys: set[str] | None = None, -) -> tuple[str, str, str] | None: - if not candidates: - return None - used = used_keys or set() - q = normalize_query(prompt) - prefer_keys: list[str] = [] - if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): - prefer_keys.extend(["hardware", "availability"]) - if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")): - prefer_keys.extend(["hardware", "cpu", "ram"]) - if "interesting" in q and "most interesting" not in q: - prefer_keys.extend(["hardware", "postgres", "cpu", "ram"]) - avoid_used = any(word in q for word in ("another", "else", "different", "other")) or "most interesting" in q - if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: - for candidate in candidates: - if candidate[0] not in used: - return candidate - return candidates[1] - if prefer_keys: - for prefer in prefer_keys: - for key, text, conf in candidates: - if key == prefer and (not avoid_used or key not in used): - return key, text, conf - for prefer in prefer_keys: - for key, text, conf in candidates: - if key == prefer: - return key, text, conf - if used and avoid_used: - for candidate in candidates: - if candidate[0] not in used: - return candidate - return candidates[0] - - -def _format_insight_text(key: str, text: str) -> str: - cleaned = text.strip().rstrip(".") - if not cleaned: - return "" - if key == "hardware": - counts = ( - cleaned.replace("Hardware mix includes ", "") - .replace("Atlas mixes tiny ", "") - .replace("Atlas mixes ", "") - .replace("which is unusual for a homelab cluster", "") - .strip() - .strip(".") - ) - has_jetson = "jetson=" in counts - has_amd64 = "amd64=" in counts - detail = f"mixed hardware stack ({counts})" - if has_jetson and has_amd64: - flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 boxes." - elif has_jetson: - flavor = "It pairs low-power Pis with Jetson accelerators for edge and AI workloads." - elif has_amd64: - flavor = "It mixes low-power Pis with a couple of heavier AMD64 nodes." - else: - flavor = "It is a pretty uniform hardware stack, which is rare for a homelab." - return f"{detail}. {flavor}" - if key == "postgres": - detail = cleaned.replace("Postgres is at ", "") - return f"Postgres is at {detail}; that feels like healthy, steady load rather than strain." - if key == "pods": - detail = cleaned.replace("There are ", "") - return f"Pods look steady ({detail}); nothing looks stuck or unhealthy." - if key == "availability": - return cleaned + " That is the kind of stability I like to see." - if key in ("cpu", "ram"): - suffix = ( - " If you're chasing hotspots, that's the node I'd watch first." - if key == "cpu" - else " That box is carrying the heaviest memory load right now." - ) - return cleaned + "." + suffix - return cleaned + "." 
- - -def _insight_prefix(prompt: str) -> str: - q = normalize_query(prompt) - if "coolest" in q: - return "If I had to pick the coolest detail, I'd say " - if "favorite" in q or "favourite" in q: - return "My favorite detail is " - if "trivia" in q: - return "A bit of trivia I like: " - if "most interesting" in q: - return "The most interesting detail to me is " - if any(word in q for word in ("another", "else", "different", "other")): - return "Another interesting detail: " - if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): - return "What stands out to me is that " - if any(word in q for word in ("interesting", "notable", "fun", "cool")): - return "One thing I'd call out is " - return "" - - def cluster_overview_answer( prompt: str, *, @@ -2784,7 +2583,7 @@ def _open_ended_system() -> str: "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible. " "Do not invent numbers or facts. " - "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100)." + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." ) @@ -2809,263 +2608,284 @@ def _ollama_call_safe( def _candidate_note(candidate: dict[str, Any]) -> str: - claim = str(candidate.get("claim") or candidate.get("summary") or "") + claim = str(candidate.get("focus") or candidate.get("answer") or "") return claim[:160] + ("…" if len(claim) > 160 else "") def _ensure_scores(answer: str) -> str: text = answer.strip() lines = [line for line in text.splitlines() if line.strip()] - has_relevance = any(line.lower().startswith("relevance:") for line in lines) - has_satisfaction = any(line.lower().startswith("satisfaction:") for line in lines) - has_confidence = any("confidence:" in line.lower() for line in lines) + has_relevance = any(line.lower().startswith("relevance") for line in lines) + has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines) + has_confidence = any(line.lower().startswith("confidence") for line in lines) + has_risk = any(line.lower().startswith("hallucinationrisk") for line in lines) if not has_confidence: lines.append("Confidence: medium") if not has_relevance: lines.append("Relevance: 70") if not has_satisfaction: lines.append("Satisfaction: 70") + if not has_risk: + lines.append("HallucinationRisk: low") return "\n".join(lines) +def _open_ended_plan( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + count: int, + state: ThoughtState | None, +) -> list[dict[str, Any]]: + if state: + state.update("planning", step=1, note="mapping angles") + count = max(1, count) + prompt_text = ( + "Analyze the question and propose up to " + f"{count} distinct answer angles that can be supported by the fact pack. " + "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). " + "If the question is subjective, propose at least one angle that surfaces a standout detail. " + "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"priority\":1-5}]}." 
+ ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + angles = result.get("angles") if isinstance(result, dict) else None + cleaned: list[dict[str, Any]] = [] + seen: set[str] = set() + if isinstance(angles, list): + for item in angles: + if not isinstance(item, dict): + continue + focus = str(item.get("focus") or "").strip() + if not focus or focus.lower() in seen: + continue + seen.add(focus.lower()) + priority = item.get("priority") + if not isinstance(priority, (int, float)): + priority = 3 + cleaned.append( + { + "focus": focus, + "reason": str(item.get("reason") or ""), + "priority": int(max(1, min(5, priority))), + } + ) + if not cleaned: + cleaned = [{"focus": "Direct answer", "reason": "Default fallback", "priority": 3}] + cleaned.sort(key=lambda item: item.get("priority", 3), reverse=True) + if state: + state.update("planning", step=1, note=_candidate_note(cleaned[0])) + return cleaned + + +def _normalize_score(value: Any, *, default: int = 60) -> int: + if isinstance(value, (int, float)): + return int(max(0, min(100, value))) + return default + + +def _confidence_score(value: Any) -> int: + text = str(value or "").strip().lower() + if text.startswith("high"): + return 85 + if text.startswith("low"): + return 35 + return 60 + + +def _risk_penalty(value: Any) -> int: + text = str(value or "").strip().lower() + if text.startswith("high"): + return 20 + if text.startswith("medium"): + return 10 + return 0 + + +def _open_ended_candidate( + prompt: str, + *, + focus: str, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None, + step: int, +) -> dict[str, Any]: + if state: + state.update("drafting", step=step, note=focus) + prompt_text = ( + "Using ONLY the fact pack, answer the question focusing on this angle: " + f"{focus}. " + "Write 2-4 sentences in plain prose (not a list). " + "If you infer, label it as inference. " + "Return JSON: {\"answer\":\"...\",\"confidence\":\"high|medium|low\"," + "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + if not isinstance(result, dict): + result = {} + answer = str(result.get("answer") or "").strip() + if not answer: + answer = "I don't have enough data to answer that from the current snapshot." 
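+    # Bundle the draft answer with the model's self-reported quality fields;
+    # _candidate_score() collapses them into a single ranking value below.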
+ candidate = { + "focus": focus, + "answer": answer, + "confidence": result.get("confidence", "medium"), + "relevance": _normalize_score(result.get("relevance"), default=60), + "satisfaction": _normalize_score(result.get("satisfaction"), default=60), + "risk": result.get("risk", "medium"), + } + candidate["score"] = _candidate_score(candidate) + return candidate + + +def _candidate_score(candidate: dict[str, Any]) -> float: + relevance = _normalize_score(candidate.get("relevance"), default=60) + satisfaction = _normalize_score(candidate.get("satisfaction"), default=60) + confidence = _confidence_score(candidate.get("confidence")) + score = relevance * 0.45 + satisfaction * 0.35 + confidence * 0.2 + return score - _risk_penalty(candidate.get("risk")) + + +def _select_candidates(candidates: list[dict[str, Any]], *, count: int) -> list[dict[str, Any]]: + if not candidates: + return [] + ranked = sorted(candidates, key=lambda item: item.get("score", 0), reverse=True) + picked: list[dict[str, Any]] = [] + seen_focus: set[str] = set() + for item in ranked: + focus = str(item.get("focus") or "").strip().lower() + if focus and focus in seen_focus: + continue + picked.append(item) + if focus: + seen_focus.add(focus) + if len(picked) >= count: + break + return picked or ranked[:count] + + +def _open_ended_synthesize( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + candidates: list[dict[str, Any]], + state: ThoughtState | None, + step: int, +) -> str: + if state: + state.update("synthesizing", step=step, note="composing answer") + synth_prompt = ( + "Compose the final answer to the question using the candidate answers below. " + "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. " + "Use only the fact pack as evidence. " + "If you infer, label it as inference. " + "Avoid repeating the last response if possible. 
" + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " + "HallucinationRisk (low|medium|high).\n" + f"Question: {prompt}\n" + f"Candidates: {json.dumps(candidates, ensure_ascii=False)}" + ) + context = _append_history_context(fact_pack, history_lines) + reply = _ollama_call_safe( + ("open", "synth"), + synth_prompt, + context=context, + fallback="I don't have enough data to answer that.", + system_override=_open_ended_system(), + ) + return _ensure_scores(reply) + + +def _open_ended_multi( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + mode: str, + state: ThoughtState | None = None, +) -> str: + angle_count = 2 if mode == "fast" else 4 + total_steps = 1 + angle_count + 2 + if state: + state.total_steps = total_steps + angles = _open_ended_plan( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + count=angle_count, + state=state, + ) + candidates: list[dict[str, Any]] = [] + step = 2 + for angle in angles[:angle_count]: + candidates.append( + _open_ended_candidate( + prompt, + focus=str(angle.get("focus") or "Direct answer"), + fact_pack=fact_pack, + history_lines=history_lines, + state=state, + step=step, + ) + ) + step += 1 + if state: + state.update("evaluating", step=step, note="ranking candidates") + selected = _select_candidates(candidates, count=1 if mode == "fast" else 2) + step += 1 + reply = _open_ended_synthesize( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + state=state, + step=step, + ) + if state: + state.update("done", step=total_steps) + return reply + + +def _open_ended_total_steps(mode: str) -> int: + angle_count = 2 if mode == "fast" else 4 + return 1 + angle_count + 2 + + def _open_ended_fast( prompt: str, *, fact_pack: str, history_lines: list[str], - fact_lines: list[str], - fact_meta: dict[str, dict[str, Any]], - tags_available: set[str], - history_tags: set[str], state: ThoughtState | None = None, ) -> str: - if state: - state.update("planning", step=1) - analysis = _interpret_open_question( + return _open_ended_multi( prompt, fact_pack=fact_pack, history_lines=history_lines, - tags_available=tags_available, - avoid_tags=history_tags, + mode="fast", state=state, ) - candidates = _select_insights( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - state=state or ThoughtState(), - analysis=analysis, - fact_lines=fact_lines, - fact_meta=fact_meta, - avoid_tags=history_tags, - ) - prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)} - selected = _select_diverse_insights( - candidates, - preference=analysis.get("preference", "balanced"), - prefer_tags=prefer_tags, - avoid_tags=history_tags, - history_tags=history_tags, - fact_meta=fact_meta, - count=2, - ) - if state: - state.update("synthesizing", step=3) - synthesis_prompt = ( - "Use the question, fact pack, and selected insights to answer in 2-4 sentences. " - "Speak naturally, not as a list. " - "If the question is subjective, add a light opinion grounded in facts. " - "Avoid repeating the exact same observation as the most recent response if possible. 
" - "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" - f"Question: {prompt}\n" - f"Selected: {json.dumps(selected, ensure_ascii=False)}" - ) - context = _append_history_context(fact_pack, history_lines) - reply = _ollama_call_safe( - ("fast", "open"), - synthesis_prompt, - context=context, - fallback="I don't have enough data to answer that.", - system_override=_open_ended_system(), - ) - return _ensure_scores(reply) - - -def _interpret_open_question( - prompt: str, - *, - fact_pack: str, - history_lines: list[str], - tags_available: set[str], - avoid_tags: set[str], - state: ThoughtState | None = None, -) -> dict[str, Any]: - tags_list = ", ".join(sorted(tags_available)) if tags_available else "none" - avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" - prompt_text = ( - "Analyze the question against the fact pack. " - "Return JSON: {\"focus\":\"...\",\"preference\":\"balanced|novelty|utilization|stability|risk\"," - "\"tags\":[\"...\"] ,\"notes\":\"...\"}. " - "If the question implies interesting/unique/unconventional/cool, set preference to novelty " - "and prefer dynamic tags (utilization/pods/database/availability) when possible. " - f"Use only these tags if relevant: {tags_list}. Avoid tags: {avoid_list}. " - "Use only the fact pack." - ) - context = _append_history_context(fact_pack, history_lines) - analysis = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) - if not isinstance(analysis, dict): - analysis = {"focus": "cluster snapshot", "preference": "balanced", "notes": "", "tags": []} - preference = analysis.get("preference") or "balanced" - if preference not in ("balanced", "novelty", "utilization", "stability", "risk"): - preference = "balanced" - analysis["preference"] = preference - analysis.setdefault("focus", "cluster snapshot") - analysis.setdefault("notes", "") - tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else [] - clean_tags = {t for t in tags if isinstance(t, str)} - analysis["tags"] = sorted(clean_tags & tags_available) - if state: - state.update("planning", step=1, note=str(analysis.get("focus") or "")) - return analysis - - -def _select_insights( - prompt: str, - *, - fact_pack: str, - history_lines: list[str], - state: ThoughtState, - analysis: dict[str, Any], - fact_lines: list[str], - fact_meta: dict[str, dict[str, Any]], - avoid_tags: set[str], -) -> list[dict[str, Any]]: - preferred_tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else [] - prefer_list = ", ".join(sorted({t for t in preferred_tags if isinstance(t, str)})) - avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" - available_list = ", ".join(sorted({t for t in _ALLOWED_INSIGHT_TAGS})) - insight_prompt = ( - "From the fact pack, select 3-5 candidate insights that could answer the question. " - "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," - "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\",\"tags\":[\"...\"]}]}. " - f"Available tags: {available_list}. Prefer tags: {prefer_list or 'none'}. Avoid tags: {avoid_list}. " - "Use only the fact pack and provided tags." 
- ) - state.update("drafting candidates", step=2) - context = _append_history_context(fact_pack, history_lines) - result = _ollama_json_call(insight_prompt + f" Question: {prompt}", context=context) - insights = result.get("insights") if isinstance(result, dict) else None - if not isinstance(insights, list): - insights = [] - cleaned: list[dict[str, Any]] = [] - for item in insights: - if not isinstance(item, dict): - continue - if not item.get("summary") or not item.get("fact_ids"): - continue - tags = _insight_tags(item, fact_meta) - item["tags"] = sorted(tags) - cleaned.append(item) - state.update("drafting candidates", step=2, note=_candidate_note(item)) - seeds = _seed_insights(fact_lines, fact_meta) - for seed in seeds: - cleaned.append(seed) - return cleaned - - -def _score_insight(insight: dict[str, Any], preference: str) -> float: - relevance = insight.get("relevance") if isinstance(insight.get("relevance"), (int, float)) else 0.0 - novelty = insight.get("novelty") if isinstance(insight.get("novelty"), (int, float)) else 0.0 - if preference == "novelty": - return 0.4 * relevance + 0.6 * novelty - if preference == "utilization": - return 0.7 * relevance + 0.3 * novelty - if preference == "stability": - return 0.7 * relevance + 0.3 * novelty - if preference == "risk": - return 0.6 * relevance + 0.4 * novelty - return 0.6 * relevance + 0.4 * novelty def _open_ended_deep( prompt: str, *, fact_pack: str, - fact_ids: set[str], history_lines: list[str], - fact_lines: list[str], - fact_meta: dict[str, dict[str, Any]], - tags_available: set[str], - history_tags: set[str], state: ThoughtState | None = None, ) -> str: - state = state or ThoughtState() - if not fact_ids: - return _ensure_scores("I don't have enough data to answer that.") - state.total_steps = 7 - analysis = _interpret_open_question( + return _open_ended_multi( prompt, fact_pack=fact_pack, history_lines=history_lines, - tags_available=tags_available, - avoid_tags=history_tags, + mode="deep", state=state, ) - candidates = _select_insights( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - state=state, - analysis=analysis, - fact_lines=fact_lines, - fact_meta=fact_meta, - avoid_tags=history_tags, - ) - state.update("verifying", step=3, note="scoring insights") - filtered: list[dict[str, Any]] = [] - for cand in candidates: - cites = cand.get("fact_ids") if isinstance(cand.get("fact_ids"), list) else [] - if cites and not all(cite in fact_ids for cite in cites): - continue - filtered.append(cand) - if not filtered: - filtered = candidates - - preference = analysis.get("preference", "balanced") - prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)} - top = _select_diverse_insights( - filtered, - preference=preference, - prefer_tags=prefer_tags, - avoid_tags=history_tags, - history_tags=history_tags, - fact_meta=fact_meta, - count=2, - ) - state.update("synthesizing", step=4, note="composing response") - synth_prompt = ( - "Use the question, fact pack, and selected insights to craft a concise answer. " - "Write 2-4 sentences. Explain why the selected insights stand out. " - "If the question is subjective, include a light opinion grounded in facts. " - "Avoid repeating the same observation as the last response if possible. 
" - "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" - f"Question: {prompt}\n" - f"Interpretation: {json.dumps(analysis, ensure_ascii=False)}\n" - f"Recent tags: {', '.join(sorted(history_tags)) if history_tags else 'none'}\n" - f"Selected: {json.dumps(top, ensure_ascii=False)}" - ) - context = _append_history_context(fact_pack, history_lines) - reply = _ollama_call_safe( - ("deep", "open"), - synth_prompt, - context=context, - fallback="I don't have enough data to answer that.", - system_override=_open_ended_system(), - ) - state.update("done", step=7) - return _ensure_scores(reply) - def open_ended_answer( prompt: str, @@ -3081,30 +2901,17 @@ def open_ended_answer( if not lines: return _ensure_scores("I don't have enough data to answer that.") fact_pack = _fact_pack_text(lines) - fact_ids = {f"F{i+1}" for i in range(len(lines))} - fact_meta = _fact_pack_meta(lines) - tags_available = {tag for entry in fact_meta.values() for tag in entry.get("tags", [])} - history_tags = _history_tags(history_lines) if mode == "fast": return _open_ended_fast( prompt, fact_pack=fact_pack, history_lines=history_lines, - fact_lines=lines, - fact_meta=fact_meta, - tags_available=tags_available, - history_tags=history_tags, state=state, ) return _open_ended_deep( prompt, fact_pack=fact_pack, - fact_ids=fact_ids, history_lines=history_lines, - fact_lines=lines, - fact_meta=fact_meta, - tags_available=tags_available, - history_tags=history_tags, state=state, ) @@ -3175,12 +2982,12 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): workloads=workloads, ) followup = _is_followup_query(cleaned) - cluster_query = ( - _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) - or _knowledge_intent(cleaned) - or _is_subjective_query(cleaned) - or (history_cluster and followup) - ) + cleaned_q = normalize_query(cleaned) + cluster_affinity = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + subjective = _is_subjective_query(cleaned) + followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS) + contextual = history_cluster and (followup or followup_affinity) + cluster_query = cluster_affinity or contextual context = "" if cluster_query: context = build_context( @@ -3608,7 +3415,7 @@ def open_ended_with_thinking( ) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() - total_steps = 4 if mode == "fast" else 7 + total_steps = _open_ended_total_steps(mode) state = ThoughtState(total_steps=total_steps) def worker(): @@ -3722,12 +3529,12 @@ def sync_loop(token: str, room_id: str): workloads=workloads, ) followup = _is_followup_query(cleaned_body) - cluster_query = ( - _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) - or _knowledge_intent(cleaned_body) - or _is_subjective_query(cleaned_body) - or (history_cluster and followup) - ) + cleaned_q = normalize_query(cleaned_body) + cluster_affinity = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + subjective = _is_subjective_query(cleaned_body) + followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS) + contextual = history_cluster and (followup or followup_affinity) + cluster_query = cluster_affinity or contextual context = "" if cluster_query: context = build_context( From ca7a08e791ca1dcc286d0b44c41f616ef23a0bdc Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:28:37 -0300 Subject: [PATCH 369/416] monitoring: fix grafana smtp from address --- 
services/monitoring/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 6185e59..78eaf3c 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -340,7 +340,7 @@ spec: GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" GF_SMTP_ENABLED: "true" GF_SMTP_HOST: "smtp.postmarkapp.com:587" - GF_SMTP_FROM: "no-reply-grafana@bstein.dev" + GF_SMTP_FROM_ADDRESS: "no-reply-grafana@bstein.dev" GF_SMTP_FROM_NAME: "Atlas Grafana" GRAFANA_ALERT_EMAILS: "brad@bstein.dev" GF_SECURITY_ALLOW_EMBEDDING: "true" From fc10eed704d4288c3cce20325526ff84059572c2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:32:25 -0300 Subject: [PATCH 370/416] atlasbot: fix score formatting --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 17e2cb2..7ad44d4 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-75 + checksum/atlasbot-configmap: manual-atlasbot-76 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 0668521..9ecd06d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2614,7 +2614,7 @@ def _candidate_note(candidate: dict[str, Any]) -> str: def _ensure_scores(answer: str) -> str: text = answer.strip() - lines = [line for line in text.splitlines() if line.strip()] + lines = [line.strip() for line in text.splitlines() if line.strip()] has_relevance = any(line.lower().startswith("relevance") for line in lines) has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines) has_confidence = any(line.lower().startswith("confidence") for line in lines) From 23533e08ee43ceb5aa3c2d7357a8653ed390f7ff Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:44:49 -0300 Subject: [PATCH 371/416] atlasbot: refine cluster intent handling --- services/comms/scripts/atlasbot/bot.py | 92 ++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 5 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9ecd06d..f85b81a 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -152,6 +152,16 @@ CLUSTER_HINT_WORDS = { "deployment", "daemonset", "statefulset", + "snapshot", + "anomaly", + "anomalies", + "monitor", + "monitoring", + "runbook", + "runbooks", + "documentation", + "docs", + "playbook", "grafana", "victoria", "prometheus", @@ -203,6 +213,12 @@ _INSIGHT_HINT_WORDS = { "favorite", "favourite", "trivia", + "anomaly", + "anomalies", + "monitor", + "monitoring", + "alert", + "alerts", "stand out", "stands out", } @@ -532,7 +548,14 @@ def _humanize_rate(value: str, *, unit: str) -> str: return f"{val:.2f} B/s" def _has_any(text: str, phrases: tuple[str, ...]) -> bool: - return any(p in text for p in phrases) + for phrase in phrases: + if " " in phrase: + if phrase in text: + return True + else: + if re.search(rf"\\b{re.escape(phrase)}\\b", text): + 
return True + return False def _detect_operation(q: str) -> str | None: if _has_any(q, OPERATION_HINTS["top"]): @@ -552,6 +575,8 @@ def _detect_metric(q: str) -> str | None: part = part.strip() if len(part) >= 2: expanded.add(part) + if part.endswith("s") and len(part) >= 4: + expanded.add(part[:-1]) tokens = expanded for metric, phrases in METRIC_HINTS.items(): for phrase in phrases: @@ -565,6 +590,8 @@ def _detect_metric(q: str) -> str | None: def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: include: set[str] = set() exclude: set[str] = set() + if any(term in q for term in ("gpu", "gpus", "accelerator", "accelerators", "cuda", "nvidia")): + include.add("jetson") rpi_specific = any( phrase in q for phrase in ( @@ -1287,6 +1314,10 @@ def snapshot_metric_answer( failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") status_terms = ("running", "pending", "failed", "succeeded", "completed") + if "not running" in q or "not in running" in q or "non running" in q: + parts = [v for v in (pending, failed, succeeded) if isinstance(v, (int, float))] + if parts: + return _format_confidence(f"Pods not running: {sum(parts):.0f}.", "high") if sum(1 for term in status_terms if term in q) > 1: parts = [] if running is not None: @@ -1350,6 +1381,8 @@ def structured_answer( op = "top" entity = _detect_entity(q) include_hw, exclude_hw = _detect_hardware_filters(q) + if entity is None and (include_hw or exclude_hw): + entity = "node" nodes_in_query = _extract_titan_nodes(q) only_workers = "worker" in q or "workers" in q role_filters = _detect_role_filters(q) @@ -1385,6 +1418,20 @@ def structured_answer( if hw_line: return _format_confidence(hw_line, "medium") + if ( + entity == "node" + and any(term in q for term in ("arm64", "amd64")) + and any(term in q for term in ("mostly", "majority", "more")) + ): + arm64_count = len([n for n in inventory if n.get("arch") == "arm64"]) + amd64_count = len([n for n in inventory if n.get("arch") == "amd64"]) + if arm64_count or amd64_count: + majority = "arm64" if arm64_count >= amd64_count else "amd64" + return _format_confidence( + f"arm64 nodes: {arm64_count}, amd64 nodes: {amd64_count}. Mostly {majority}.", + "high", + ) + if op == "top" and metric is None and not any(word in q for word in ("hardware", "architecture", "class")): metric = "cpu" @@ -1491,6 +1538,27 @@ def structured_answer( ) if op == "count": + if only_workers and "ready" in q and ("total" in q or "vs" in q or "versus" in q): + total_workers = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=True, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + ready_workers = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=True, + only_ready=True, + nodes_in_query=nodes_in_query, + ) + return _format_confidence( + f"Worker nodes ready: {len(ready_workers)} / {len(total_workers)} total.", + "high", + ) if expected_workers and ("expected" in q or "should" in q): missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." 
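Aside: the ready-versus-total reply added above is just two passes of the same
inventory filter with different readiness arguments. A minimal sketch of that
pattern, assuming a simplified node-dict shape rather than the repo's actual
_inventory_filter contract:

def ready_vs_total(inventory: list[dict]) -> str:
    # Hypothetical fields: "role" marks workers, "ready" is a readiness flag.
    workers = [n for n in inventory if n.get("role") == "worker"]
    ready = [n for n in workers if n.get("ready")]
    return f"Worker nodes ready: {len(ready)} / {len(workers)} total."

print(ready_vs_total([
    {"name": "titan-20", "role": "worker", "ready": True},
    {"name": "titan-21", "role": "worker", "ready": False},
]))  # -> Worker nodes ready: 1 / 2 total.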
@@ -1711,6 +1779,15 @@ def _doc_intent(query: str) -> bool: "how to", "instructions", "playbook", + "next step", + "next steps", + "what should", + "what do i", + "what to do", + "troubleshoot", + "triage", + "recover", + "remediate", ) ) @@ -2615,10 +2692,13 @@ def _candidate_note(candidate: dict[str, Any]) -> str: def _ensure_scores(answer: str) -> str: text = answer.strip() lines = [line.strip() for line in text.splitlines() if line.strip()] - has_relevance = any(line.lower().startswith("relevance") for line in lines) - has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines) - has_confidence = any(line.lower().startswith("confidence") for line in lines) - has_risk = any(line.lower().startswith("hallucinationrisk") for line in lines) + def _score_key(line: str) -> str: + cleaned = line.strip().lstrip("-•* ").strip() + return cleaned.lower() + has_relevance = any(_score_key(line).startswith("relevance") for line in lines) + has_satisfaction = any(_score_key(line).startswith("satisfaction") for line in lines) + has_confidence = any(_score_key(line).startswith("confidence") for line in lines) + has_risk = any(_score_key(line).startswith("hallucinationrisk") for line in lines) if not has_confidence: lines.append("Confidence: medium") if not has_relevance: @@ -3004,6 +3084,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): _is_subjective_query(cleaned) or _knowledge_intent(cleaned) or _is_overview_query(cleaned) + or _doc_intent(cleaned) ) if open_ended: answer = open_ended_answer( @@ -3558,6 +3639,7 @@ def sync_loop(token: str, room_id: str): _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body) or _is_overview_query(cleaned_body) + or _doc_intent(cleaned_body) ) if open_ended: reply = open_ended_with_thinking( From 24b0ac78c4720656127404769bb67f94b823b118 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:45:17 -0300 Subject: [PATCH 372/416] chore: bump atlasbot config checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7ad44d4..01aebef 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-76 + checksum/atlasbot-configmap: manual-atlasbot-77 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From b9b25565a26522c1c14600ca34a5af6574302031 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:55:00 -0300 Subject: [PATCH 373/416] atlasbot: tighten scoring and readiness logic --- services/comms/scripts/atlasbot/bot.py | 97 +++++++++++++++++++++----- 1 file changed, 81 insertions(+), 16 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f85b81a..29f5375 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1297,7 +1297,7 @@ def snapshot_metric_answer( parts: list[str] = [] if used is not None and max_conn is not None: free = max_conn - used - if any(word in q for word in ("free", "available", "remaining")): + if any(word in q for word in ("free", "available", "remaining", "remain", "left")): parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max ({free:.0f} free).") else: 
parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") @@ -1387,13 +1387,23 @@ def structured_answer( only_workers = "worker" in q or "workers" in q role_filters = _detect_role_filters(q) only_ready: bool | None = None - if "not ready" in q or "unready" in q or "down" in q or "missing" in q: + if ( + "not ready" in q + or "notready" in q + or "not-ready" in q + or "unready" in q + or "down" in q + or "missing" in q + ): only_ready = False elif "ready" in q: only_ready = True if entity == "node" and only_ready is not None and op != "count": op = "status" + if entity == "node" and only_ready is not None and op == "count": + if not any(term in q for term in ("how many", "count", "number")): + op = "status" if not op and entity == "node": op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" @@ -2692,22 +2702,67 @@ def _candidate_note(candidate: dict[str, Any]) -> str: def _ensure_scores(answer: str) -> str: text = answer.strip() lines = [line.strip() for line in text.splitlines() if line.strip()] + score_map: dict[str, str] = {} + body_lines: list[str] = [] + def _score_key(line: str) -> str: cleaned = line.strip().lstrip("-•* ").strip() return cleaned.lower() - has_relevance = any(_score_key(line).startswith("relevance") for line in lines) - has_satisfaction = any(_score_key(line).startswith("satisfaction") for line in lines) - has_confidence = any(_score_key(line).startswith("confidence") for line in lines) - has_risk = any(_score_key(line).startswith("hallucinationrisk") for line in lines) - if not has_confidence: - lines.append("Confidence: medium") - if not has_relevance: - lines.append("Relevance: 70") - if not has_satisfaction: - lines.append("Satisfaction: 70") - if not has_risk: - lines.append("HallucinationRisk: low") - return "\n".join(lines) + + def _extract_value(line: str) -> str: + cleaned = line.strip().lstrip("-•* ").strip() + if ":" in cleaned: + return cleaned.split(":", 1)[1].strip() + parts = cleaned.split() + return parts[1] if len(parts) > 1 else "" + + def _record_score(key: str, value: str): + if not value: + return + score_map.setdefault(key, value) + + for line in lines: + cleaned = line.strip().lstrip("-•* ").strip() + lowered = cleaned.lower() + if lowered.startswith("confidence,") or ( + "confidence" in lowered and "relevance" in lowered and "satisfaction" in lowered + ): + for key in ("confidence", "relevance", "satisfaction"): + match = re.search(rf"{key}\\s*[:=]?\\s*(\\d{{1,3}}|high|medium|low)", lowered) + if match: + _record_score(key, match.group(1)) + risk_match = re.search(r"hallucination\\s*risk\\s*[:=]?\\s*(low|medium|high)", lowered) + if risk_match: + _record_score("hallucinationrisk", risk_match.group(1)) + continue + if lowered.startswith("confidence"): + _record_score("confidence", _extract_value(cleaned)) + continue + if lowered.startswith("relevance"): + _record_score("relevance", _extract_value(cleaned)) + continue + if lowered.startswith("satisfaction"): + _record_score("satisfaction", _extract_value(cleaned)) + continue + if lowered.replace(" ", "").startswith("hallucinationrisk") or lowered.startswith( + "hallucination risk" + ): + _record_score("hallucinationrisk", _extract_value(cleaned)) + continue + body_lines.append(line) + + confidence = score_map.get("confidence") or "medium" + relevance = score_map.get("relevance") or "70" + satisfaction = score_map.get("satisfaction") or "70" + risk = score_map.get("hallucinationrisk") or "low" + + final_lines = body_lines + [ + f"Confidence: {confidence}", + 
f"Relevance: {relevance}", + f"Satisfaction: {satisfaction}", + f"HallucinationRisk: {risk}", + ] + return "\n".join(final_lines) def _open_ended_plan( @@ -2799,7 +2854,8 @@ def _open_ended_candidate( f"{focus}. " "Write 2-4 sentences in plain prose (not a list). " "If you infer, label it as inference. " - "Return JSON: {\"answer\":\"...\",\"confidence\":\"high|medium|low\"," + "List which fact pack IDs you used. " + "Return JSON: {\"answer\":\"...\",\"facts_used\":[\"F1\"],\"confidence\":\"high|medium|low\"," "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}." ) context = _append_history_context(fact_pack, history_lines) @@ -2809,9 +2865,13 @@ def _open_ended_candidate( answer = str(result.get("answer") or "").strip() if not answer: answer = "I don't have enough data to answer that from the current snapshot." + facts_used = result.get("facts_used") + if not isinstance(facts_used, list): + facts_used = [] candidate = { "focus": focus, "answer": answer, + "facts_used": facts_used, "confidence": result.get("confidence", "medium"), "relevance": _normalize_score(result.get("relevance"), default=60), "satisfaction": _normalize_score(result.get("satisfaction"), default=60), @@ -2826,6 +2886,8 @@ def _candidate_score(candidate: dict[str, Any]) -> float: satisfaction = _normalize_score(candidate.get("satisfaction"), default=60) confidence = _confidence_score(candidate.get("confidence")) score = relevance * 0.45 + satisfaction * 0.35 + confidence * 0.2 + if not candidate.get("facts_used"): + score -= 5 return score - _risk_penalty(candidate.get("risk")) @@ -2863,6 +2925,9 @@ def _open_ended_synthesize( "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. " "Use only the fact pack as evidence. " "If you infer, label it as inference. " + "Do not claim nodes are missing or not ready unless the fact pack explicitly lists " + "nodes_not_ready or expected_workers_missing. " + "Keep the tone conversational and answer the user's intent directly. " "Avoid repeating the last response if possible. 
" "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " "HallucinationRisk (low|medium|high).\n" From 380aae3b2c75220cbbbe8de63627c4b49b862ff0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:55:24 -0300 Subject: [PATCH 374/416] chore: bump atlasbot checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 01aebef..a06e628 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-77 + checksum/atlasbot-configmap: manual-atlasbot-78 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 4b468b0f97846e0de41dcba10287f6682da43ec4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:01:51 -0300 Subject: [PATCH 375/416] atlasbot: fix word boundary detection --- services/comms/scripts/atlasbot/bot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 29f5375..77868f1 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -553,7 +553,7 @@ def _has_any(text: str, phrases: tuple[str, ...]) -> bool: if phrase in text: return True else: - if re.search(rf"\\b{re.escape(phrase)}\\b", text): + if re.search(rf"\b{re.escape(phrase)}\b", text): return True return False From 67ca0d451dcb48401b41ec1aada975e5362b2919 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:02:22 -0300 Subject: [PATCH 376/416] chore: bump atlasbot checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index a06e628..530fb40 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-78 + checksum/atlasbot-configmap: manual-atlasbot-79 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 7b1c891e70649529ae7de93c3a716c20b5991d0d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:16:53 -0300 Subject: [PATCH 377/416] atlasbot: improve metric detection and counts --- services/comms/scripts/atlasbot/bot.py | 81 +++++++++++++++++++++----- 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 77868f1..eca5fef 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -120,6 +120,7 @@ OPERATION_HINTS = { "count": ("how many", "count", "number", "total"), "list": ("list", "which", "what are", "show", "names"), "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum", "busiest", "busy"), + "bottom": ("lowest", "least", "minimum", "min", "smallest"), "status": ("ready", "not ready", "unready", "down", "missing", "status"), } @@ -568,6 +569,14 @@ def _detect_operation(q: str) -> str | None: return None def _detect_metric(q: str) -> str | None: + q = 
normalize_query(q) + if _has_any(q, ("disk", "storage")): + return "io" + if _has_any(q, ("io",)) and not _has_any(q, METRIC_HINTS["net"]): + return "io" + for metric, phrases in METRIC_HINTS.items(): + if _has_any(q, phrases): + return metric tokens = set(_tokens(q)) expanded: set[str] = set(tokens) for token in list(tokens): @@ -1237,6 +1246,34 @@ def _node_usage_top( return None +def _node_usage_bottom( + usage: list[dict[str, Any]], + *, + allowed_nodes: set[str] | None, +) -> tuple[str, float] | None: + best_node: str | None = None + best_val: float | None = None + for item in usage: + if not isinstance(item, dict): + continue + node = item.get("node") + if not node or not isinstance(node, str): + continue + if allowed_nodes and node not in allowed_nodes: + continue + value = item.get("value") + try: + numeric = float(value) + except (TypeError, ValueError): + continue + if best_val is None or numeric < best_val: + best_val = numeric + best_node = node + if best_node and best_val is not None: + return best_node, best_val + return None + + def snapshot_metric_answer( prompt: str, *, @@ -1267,18 +1304,20 @@ def snapshot_metric_answer( ) allowed_nodes = {node["name"] for node in filtered} if filtered else None - if metric in {"cpu", "ram", "net", "io"} and op in {"top", "status", None}: + if metric in {"cpu", "ram", "net", "io"} and op in {"top", "bottom", "status", None}: usage = metrics.get("node_usage", {}).get(metric, []) - top = _node_usage_top(usage, allowed_nodes=allowed_nodes) - if top: - node, val = top + pick = _node_usage_bottom if op == "bottom" else _node_usage_top + chosen = pick(usage, allowed_nodes=allowed_nodes) + if chosen: + node, val = chosen percent = metric in {"cpu", "ram"} value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"}) scope = "" if include_hw: scope = f" among {' and '.join(sorted(include_hw))}" - answer = f"Hottest node{scope}: {node} ({value})." - if allowed_nodes and len(allowed_nodes) != len(inventory): + label = "Lowest" if op == "bottom" else "Hottest" + answer = f"{label} node{scope}: {node} ({value})." + if allowed_nodes and len(allowed_nodes) != len(inventory) and op != "bottom": overall = _node_usage_top(usage, allowed_nodes=None) if overall and overall[0] != node: overall_val = _format_metric_value( @@ -1314,6 +1353,10 @@ def snapshot_metric_answer( failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") status_terms = ("running", "pending", "failed", "succeeded", "completed") + if "total" in q or "sum" in q: + values = [v for v in (running, pending, failed, succeeded) if isinstance(v, (int, float))] + if values: + return _format_confidence(f"Total pods: {sum(values):.0f}.", "high") if "not running" in q or "not in running" in q or "non running" in q: parts = [v for v in (pending, failed, succeeded) if isinstance(v, (int, float))] if parts: @@ -1468,7 +1511,8 @@ def structured_answer( node, val = _primary_series_metric(res) if node and val is not None: percent = _metric_expr_uses_percent(entry) - value_fmt = _format_metric_value(val or "", percent=percent) + rate = metric in {"net", "io"} + value_fmt = _format_metric_value(val or "", percent=percent, rate=rate) metric_label = (metric or "").upper() label = f"{metric_label} node" if metric_label else "node" answer = f"Hottest {label}: {node} ({value_fmt})." 
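Aside: _node_usage_bottom mirrors _node_usage_top with the comparison flipped,
and the same selection reduces to a guarded min/max scan. A sketch under the
assumption that usage entries carry "node" and "value" keys, as the
surrounding code suggests:

def pick_node(usage: list[dict], lowest: bool = False):
    numeric = []
    for item in usage:
        try:
            numeric.append((item["node"], float(item["value"])))
        except (KeyError, TypeError, ValueError):
            continue  # skip malformed entries, as the helpers above do
    if not numeric:
        return None
    return (min if lowest else max)(numeric, key=lambda pair: pair[1])

print(pick_node([{"node": "titan-20", "value": "71.2"},
                 {"node": "titan-24", "value": "12.5"}], lowest=True))
# -> ('titan-24', 12.5)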
@@ -1495,7 +1539,8 @@ def structured_answer( scoped_node, scoped_val = _primary_series_metric(res) if base_node and scoped_node and base_node != scoped_node: percent = _metric_expr_uses_percent(entry) - base_val_fmt = _format_metric_value(base_val or "", percent=percent) + rate = metric in {"net", "io"} + base_val_fmt = _format_metric_value(base_val or "", percent=percent, rate=rate) overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})." return _format_confidence(f"Among {scope} nodes, {answer}{overall_note}", "high") return _format_confidence(answer, "high") @@ -1525,9 +1570,14 @@ def structured_answer( names = [node["name"] for node in filtered] if op == "status": + scope_label = "nodes" + if include_hw: + scope_label = f"{' and '.join(sorted(include_hw))} nodes" + elif only_workers: + scope_label = "worker nodes" if "missing" in q and ("ready" in q or "readiness" in q): return _format_confidence( - "Not ready nodes: " + (", ".join(names) if names else "none") + ".", + f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".", "high", ) if "missing" in q and expected_workers: @@ -1538,16 +1588,21 @@ def structured_answer( ) if only_ready is False: return _format_confidence( - "Not ready nodes: " + (", ".join(names) if names else "none") + ".", + f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".", "high", ) if only_ready is True: return _format_confidence( - f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + ".", + f"Ready {scope_label} ({len(names)}): " + (", ".join(names) if names else "none") + ".", "high", ) if op == "count": + scope_label = "nodes" + if include_hw: + scope_label = f"{' and '.join(sorted(include_hw))} nodes" + elif only_workers: + scope_label = "worker nodes" if only_workers and "ready" in q and ("total" in q or "vs" in q or "versus" in q): total_workers = _inventory_filter( inventory, @@ -1576,9 +1631,9 @@ def structured_answer( msg += f" Missing: {', '.join(missing)}." 
return _format_confidence(msg, "high") if only_ready is True: - return _format_confidence(f"Ready nodes: {len(names)}.", "high") + return _format_confidence(f"Ready {scope_label}: {len(names)}.", "high") if only_ready is False: - return _format_confidence(f"Not ready nodes: {len(names)}.", "high") + return _format_confidence(f"Not ready {scope_label}: {len(names)}.", "high") if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters): return _format_confidence(f"Atlas has {len(names)} nodes.", "high") return _format_confidence(f"Matching nodes: {len(names)}.", "high") From 19d10ce5858d28351c8729a7a0ddfbede736cf30 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:17:23 -0300 Subject: [PATCH 378/416] chore: bump atlasbot checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 530fb40..94eeea7 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-79 + checksum/atlasbot-configmap: manual-atlasbot-80 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From c5a7eece35f6d374e1b7a223840df9f1b2924413 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:23:42 -0300 Subject: [PATCH 379/416] monitoring: tune cpu and maintenance alerts --- services/monitoring/grafana-alerting-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index 8713d3d..d97db15 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -145,7 +145,7 @@ data: model: intervalMs: 60000 maxDataPoints: 43200 - expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] + expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\") legendFormat: '{{instance}}' datasource: type: prometheus @@ -175,9 +175,9 @@ data: type: last type: query noDataState: NoData - execErrState: Error + execErrState: NoData annotations: - summary: "{{ $labels.instance }} CPU >90% for 10m" + summary: "{{ $labels.node }} CPU >90% for 10m" labels: severity: warning - orgId: 1 @@ -297,7 +297,7 @@ data: to: 0 datasourceUid: atlas-vm model: - expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) + expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0) intervalMs: 60000 maxDataPoints: 43200 legendFormat: '{{cronjob}}' From f43acaa554e76bdaf7f7d82d7262cdd63dc03942 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:24:12 -0300 Subject: [PATCH 380/416] atlasbot: fix bottom ops and pod queries --- services/comms/scripts/atlasbot/bot.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git 
a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index eca5fef..7f22ad5 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -163,6 +163,8 @@ CLUSTER_HINT_WORDS = { "documentation", "docs", "playbook", + "utilization", + "usage", "grafana", "victoria", "prometheus", @@ -561,8 +563,10 @@ def _has_any(text: str, phrases: tuple[str, ...]) -> bool: def _detect_operation(q: str) -> str | None: if _has_any(q, OPERATION_HINTS["top"]): return "top" + if _has_any(q, OPERATION_HINTS["bottom"]): + return "bottom" for op, phrases in OPERATION_HINTS.items(): - if op == "top": + if op in ("top", "bottom"): continue if _has_any(q, phrases): return op @@ -1353,6 +1357,11 @@ def snapshot_metric_answer( failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") status_terms = ("running", "pending", "failed", "succeeded", "completed") + if ("most pods" in q or ("most" in q and "pod" in q and "node" in q)) and not nodes_in_query: + return _format_confidence( + "I don't have per-node pod counts in the snapshot.", + "medium", + ) if "total" in q or "sum" in q: values = [v for v in (running, pending, failed, succeeded) if isinstance(v, (int, float))] if values: @@ -1363,13 +1372,13 @@ def snapshot_metric_answer( return _format_confidence(f"Pods not running: {sum(parts):.0f}.", "high") if sum(1 for term in status_terms if term in q) > 1: parts = [] - if running is not None: + if "running" in q and running is not None: parts.append(f"running {running:.0f}") - if pending is not None: + if "pending" in q and pending is not None: parts.append(f"pending {pending:.0f}") - if failed is not None: + if "failed" in q and failed is not None: parts.append(f"failed {failed:.0f}") - if succeeded is not None: + if ("succeeded" in q or "completed" in q) and succeeded is not None: parts.append(f"succeeded {succeeded:.0f}") if parts: return _format_confidence(f"Pods: {', '.join(parts)}.", "high") @@ -1461,7 +1470,12 @@ def structured_answer( if hw_line: return _format_confidence(hw_line, "high") - if entity == "node" and op == "status" and metric is None: + if ( + entity == "node" + and op == "status" + and metric is None + and not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters) + ): summary = _nodes_summary_line(inventory, snapshot) if summary: return _format_confidence(summary, "high") From 3a2bb1bac9d995fac6433dcae63ad61d8b17caaa Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:24:46 -0300 Subject: [PATCH 381/416] chore: bump atlasbot checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 94eeea7..6761287 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-80 + checksum/atlasbot-configmap: manual-atlasbot-81 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 9409c037c9db0b2aeb15f4ab6641eed30738854f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:29:46 -0300 Subject: [PATCH 382/416] monitoring: restart grafana for alerting reload --- services/monitoring/helmrelease.yaml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 78eaf3c..6651738 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -286,6 +286,7 @@ spec: podAnnotations: vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "monitoring" + monitoring.bstein.dev/restart-rev: "1" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-template-grafana-env.sh: | {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} From b34f2abefd803438c95de07b4f18bbfa0b35d9c3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:34:11 -0300 Subject: [PATCH 383/416] monitoring: fix grafana alert exec state --- services/monitoring/grafana-alerting-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index d97db15..33ac739 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -175,7 +175,7 @@ data: type: last type: query noDataState: NoData - execErrState: NoData + execErrState: OK annotations: summary: "{{ $labels.node }} CPU >90% for 10m" labels: From a10050e4c71491c5daa98e8754f91850c9ca2f05 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:45:08 -0300 Subject: [PATCH 384/416] atlasbot: overhaul reasoning pipeline --- services/comms/atlasbot-deployment.yaml | 6 +- services/comms/scripts/atlasbot/bot.py | 405 +++++++++++++++++++----- 2 files changed, 336 insertions(+), 75 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 6761287..b08f20d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-81 + checksum/atlasbot-configmap: manual-atlasbot-82 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -83,6 +83,10 @@ spec: value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL value: qwen2.5:14b-instruct + - name: ATLASBOT_MODEL_FAST + value: qwen2.5:14b-instruct + - name: ATLASBOT_MODEL_DEEP + value: qwen2.5:14b-instruct - name: OLLAMA_FALLBACK_MODEL value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7f22ad5..7e6341e 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -17,6 +17,8 @@ ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") +MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "") +MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "") FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") API_KEY = os.environ.get("CHAT_API_KEY", "") OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) @@ -372,6 +374,14 @@ def _detect_mode_from_body(body: str, *, default: str = "deep") -> str: return default +def _model_for_mode(mode: str) -> str: + if mode == "fast" and MODEL_FAST: + return MODEL_FAST + if mode == "deep" and MODEL_DEEP: + return MODEL_DEEP + return MODEL + + # Matrix HTTP helper. 
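[Editor note — not part of the patch] The _model_for_mode helper added above resolves the model per request mode, with empty environment variables falling back to OLLAMA_MODEL (both overrides are set to the same model in this patch's deployment env). A self-contained sketch of that resolution order; the standalone function and argument names here are illustrative, not the repo's:

def model_for_mode(mode: str, fast: str, deep: str, base: str) -> str:
    # Empty strings are falsy, so an unset override falls back to `base`.
    if mode == "fast" and fast:
        return fast
    if mode == "deep" and deep:
        return deep
    return base

assert model_for_mode("fast", "", "", "m") == "m"     # no override -> base model
assert model_for_mode("deep", "", "d", "m") == "d"    # deep override wins
assert model_for_mode("other", "f", "d", "m") == "m"  # unknown modes use base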
def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): url = (base or BASE) + path @@ -2487,7 +2497,13 @@ class ThoughtState: return f"Still thinking ({detail})." -def _ollama_json_call(prompt: str, *, context: str, retries: int = 2) -> dict[str, Any]: +def _ollama_json_call( + prompt: str, + *, + context: str, + retries: int = 2, + model: str | None = None, +) -> dict[str, Any]: system = ( "System: You are Atlas, a reasoning assistant. " "Return strict JSON only (no code fences, no trailing commentary). " @@ -2504,6 +2520,7 @@ def _ollama_json_call(prompt: str, *, context: str, retries: int = 2) -> dict[st context=context, use_history=False, system_override=system, + model=model, ) cleaned = _strip_code_fence(raw).strip() if cleaned.startswith("{") and cleaned.endswith("}"): @@ -2547,6 +2564,19 @@ def _fact_pack_text(lines: list[str]) -> str: return "Fact pack:\n" + "\n".join(labeled) +def _tool_fact_lines(prompt: str, *, allow_tools: bool) -> list[str]: + if not allow_tools: + return [] + metrics_context, _ = metrics_query_context(prompt, allow_tools=True) + lines: list[str] = [] + if metrics_context: + for line in metrics_context.splitlines(): + trimmed = line.strip() + if trimmed: + lines.append(f"tool_metrics: {trimmed}") + return lines + + _ALLOWED_INSIGHT_TAGS = { "availability", "architecture", @@ -2607,6 +2637,15 @@ def _history_tags(history_lines: list[str]) -> set[str]: return tags & _ALLOWED_INSIGHT_TAGS +def _normalize_fraction(value: Any, *, default: float = 0.5) -> float: + if isinstance(value, (int, float)): + score = float(value) + if score > 1: + score = score / 100.0 + return max(0.0, min(1.0, score)) + return default + + def _seed_insights( lines: list[str], fact_meta: dict[str, dict[str, Any]], @@ -2735,9 +2774,9 @@ def _open_ended_system() -> str: "Use ONLY the provided fact pack and recent chat as your evidence. " "You may draw light inferences if you label them as such. " "Write concise, human sentences with a helpful, calm tone (not a list). " - "If the question is subjective, share a light opinion grounded in facts. " + "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " - "Avoid repeating the exact same observation as the last response if possible. " + "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " "Do not invent numbers or facts. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." ) @@ -2750,6 +2789,7 @@ def _ollama_call_safe( context: str, fallback: str, system_override: str | None = None, + model: str | None = None, ) -> str: try: return _ollama_call( @@ -2758,6 +2798,7 @@ def _ollama_call_safe( context=context, use_history=False, system_override=system_override, + model=model, ) except Exception: return fallback @@ -2841,6 +2882,7 @@ def _open_ended_plan( history_lines: list[str], count: int, state: ThoughtState | None, + model: str | None, ) -> list[dict[str, Any]]: if state: state.update("planning", step=1, note="mapping angles") @@ -2850,10 +2892,15 @@ def _open_ended_plan( f"{count} distinct answer angles that can be supported by the fact pack. " "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). " "If the question is subjective, propose at least one angle that surfaces a standout detail. 
" + "Avoid repeating the same angle as the most recent response if possible. " "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"priority\":1-5}]}." ) context = _append_history_context(fact_pack, history_lines) - result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) angles = result.get("angles") if isinstance(result, dict) else None cleaned: list[dict[str, Any]] = [] seen: set[str] = set() @@ -2883,6 +2930,81 @@ def _open_ended_plan( return cleaned +def _preferred_tags_for_prompt(prompt: str) -> set[str]: + q = normalize_query(prompt) + tags: set[str] = set() + if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")): + tags.add("utilization") + if any(word in q for word in ("postgres", "database", "db", "connections")): + tags.add("database") + if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")): + tags.add("pods") + if any(word in q for word in ("workload", "service", "namespace")): + tags.add("workloads") + if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")): + tags.add("availability") + if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")): + tags.update({"hardware", "inventory", "architecture"}) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _open_ended_insights( + prompt: str, + *, + fact_pack: str, + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + count: int, + state: ThoughtState | None, + model: str | None, +) -> list[dict[str, Any]]: + if state: + state.update("analyzing", note="scouting insights") + count = max(1, count) + allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) + prompt_text = ( + "Review the fact pack and propose up to " + f"{count} insights that could answer the question. " + "Each insight should be grounded in the facts. " + "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," + "\"relevance\":0-1,\"novelty\":0-1,\"tags\":[\"tag\"],\"rationale\":\"...\"}]}. " + f"Only use tags from: {allowed_tags}." 
+ ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + insights = result.get("insights") if isinstance(result, dict) else None + cleaned: list[dict[str, Any]] = [] + valid_ids = set(fact_meta.keys()) + if isinstance(insights, list): + for item in insights: + if not isinstance(item, dict): + continue + summary = str(item.get("summary") or item.get("claim") or "").strip() + if not summary: + continue + raw_ids = item.get("fact_ids") if isinstance(item.get("fact_ids"), list) else [] + fact_ids = [fid for fid in raw_ids if isinstance(fid, str) and fid in valid_ids] + if not fact_ids: + continue + cleaned.append( + { + "summary": summary, + "fact_ids": fact_ids, + "relevance": _normalize_fraction(item.get("relevance"), default=0.6), + "novelty": _normalize_fraction(item.get("novelty"), default=0.5), + "rationale": str(item.get("rationale") or ""), + "tags": [t for t in (item.get("tags") or []) if isinstance(t, str)], + } + ) + if cleaned and state: + state.update("analyzing", note=_candidate_note(cleaned[0])) + return cleaned + + def _normalize_score(value: Any, *, default: int = 60) -> int: if isinstance(value, (int, float)): return int(max(0, min(100, value))) @@ -2915,20 +3037,31 @@ def _open_ended_candidate( history_lines: list[str], state: ThoughtState | None, step: int, + fact_hints: list[str] | None = None, + model: str | None = None, ) -> dict[str, Any]: if state: state.update("drafting", step=step, note=focus) + hint_text = "" + if fact_hints: + hint_text = " Prioritize these fact IDs if relevant: " + ", ".join(fact_hints) + "." prompt_text = ( "Using ONLY the fact pack, answer the question focusing on this angle: " f"{focus}. " - "Write 2-4 sentences in plain prose (not a list). " + "Write 2-4 sentences in plain prose (not a list)." + + hint_text + + " " "If you infer, label it as inference. " "List which fact pack IDs you used. " "Return JSON: {\"answer\":\"...\",\"facts_used\":[\"F1\"],\"confidence\":\"high|medium|low\"," "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}." ) context = _append_history_context(fact_pack, history_lines) - result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) if not isinstance(result, dict): result = {} answer = str(result.get("answer") or "").strip() @@ -2986,9 +3119,12 @@ def _open_ended_synthesize( candidates: list[dict[str, Any]], state: ThoughtState | None, step: int, + model: str | None, + critique: str | None = None, ) -> str: if state: state.update("synthesizing", step=step, note="composing answer") + critique_block = f"\nCritique guidance: {critique}\n" if critique else "\n" synth_prompt = ( "Compose the final answer to the question using the candidate answers below. " "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. 
" @@ -3001,6 +3137,7 @@ def _open_ended_synthesize( "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " "HallucinationRisk (low|medium|high).\n" f"Question: {prompt}\n" + f"{critique_block}" f"Candidates: {json.dumps(candidates, ensure_ascii=False)}" ) context = _append_history_context(fact_pack, history_lines) @@ -3010,20 +3147,55 @@ def _open_ended_synthesize( context=context, fallback="I don't have enough data to answer that.", system_override=_open_ended_system(), + model=model, ) return _ensure_scores(reply) +def _open_ended_critique( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + candidates: list[dict[str, Any]], + state: ThoughtState | None, + step: int, + model: str | None, +) -> str: + if state: + state.update("reviewing", step=step, note="quality check") + critique_prompt = ( + "Review the candidate answers against the fact pack. " + "Identify any missing important detail or risky inference and give one sentence of guidance. " + "Return JSON: {\"guidance\":\"...\",\"risk\":\"low|medium|high\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + critique_prompt + f" Question: {prompt} Candidates: {json.dumps(candidates, ensure_ascii=False)}", + context=context, + model=model, + ) + if isinstance(result, dict): + guidance = str(result.get("guidance") or "").strip() + if guidance: + return guidance + return "" + + def _open_ended_multi( prompt: str, *, fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], history_lines: list[str], mode: str, state: ThoughtState | None = None, ) -> str: + model = _model_for_mode(mode) angle_count = 2 if mode == "fast" else 4 - total_steps = 1 + angle_count + 2 + insight_count = 2 if mode == "fast" else 4 + total_steps = 2 + angle_count + 2 + (1 if mode == "deep" else 0) if state: state.total_steps = total_steps angles = _open_ended_plan( @@ -3032,10 +3204,57 @@ def _open_ended_multi( history_lines=history_lines, count=angle_count, state=state, + model=model, ) + insights = _open_ended_insights( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + count=insight_count, + state=state, + model=model, + ) + seeds = _seed_insights(fact_lines, fact_meta, limit=max(4, insight_count)) + insight_candidates = insights + seeds + subjective = _is_subjective_query(prompt) + prefer_tags = _preferred_tags_for_prompt(prompt) + history_tags = _history_tags(history_lines) + avoid_tags = history_tags if subjective else set() + preference = "novelty" if subjective else "relevance" + selected_insights = _select_diverse_insights( + insight_candidates, + preference=preference, + prefer_tags=prefer_tags, + avoid_tags=avoid_tags, + history_tags=history_tags, + fact_meta=fact_meta, + count=1 if mode == "fast" else 2, + ) + if state and selected_insights: + state.update("analyzing", note=_candidate_note(selected_insights[0])) + + angle_inputs: list[dict[str, Any]] = [] + for insight in selected_insights: + angle_inputs.append( + { + "focus": str(insight.get("summary") or "Direct answer"), + "fact_ids": insight.get("fact_ids") or [], + } + ) + for angle in angles: + if len(angle_inputs) >= angle_count: + break + angle_inputs.append( + { + "focus": str(angle.get("focus") or "Direct answer"), + "fact_ids": [], + } + ) + candidates: list[dict[str, Any]] = [] - step = 2 - for angle in angles[:angle_count]: + step = 3 + for angle in angle_inputs[:angle_count]: candidates.append( _open_ended_candidate( prompt, @@ -3044,6 +3263,8 @@ def 
_open_ended_multi( history_lines=history_lines, state=state, step=step, + fact_hints=angle.get("fact_ids") if isinstance(angle.get("fact_ids"), list) else None, + model=model, ) ) step += 1 @@ -3051,6 +3272,18 @@ def _open_ended_multi( state.update("evaluating", step=step, note="ranking candidates") selected = _select_candidates(candidates, count=1 if mode == "fast" else 2) step += 1 + critique = "" + if mode == "deep": + critique = _open_ended_critique( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + state=state, + step=step, + model=model, + ) + step += 1 reply = _open_ended_synthesize( prompt, fact_pack=fact_pack, @@ -3058,6 +3291,8 @@ def _open_ended_multi( candidates=selected or candidates, state=state, step=step, + model=model, + critique=critique, ) if state: state.update("done", step=total_steps) @@ -3066,19 +3301,23 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: angle_count = 2 if mode == "fast" else 4 - return 1 + angle_count + 2 + return 2 + angle_count + 2 + (1 if mode == "deep" else 0) def _open_ended_fast( prompt: str, *, fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], history_lines: list[str], state: ThoughtState | None = None, ) -> str: return _open_ended_multi( prompt, fact_pack=fact_pack, + fact_lines=fact_lines, + fact_meta=fact_meta, history_lines=history_lines, mode="fast", state=state, @@ -3089,12 +3328,16 @@ def _open_ended_deep( prompt: str, *, fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], history_lines: list[str], state: ThoughtState | None = None, ) -> str: return _open_ended_multi( prompt, fact_pack=fact_pack, + fact_lines=fact_lines, + fact_meta=fact_meta, history_lines=history_lines, mode="deep", state=state, @@ -3109,31 +3352,61 @@ def open_ended_answer( workloads: list[dict[str, Any]], history_lines: list[str], mode: str, + allow_tools: bool, state: ThoughtState | None = None, ) -> str: lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if _knowledge_intent(prompt) or _doc_intent(prompt): + kb_detail = kb_retrieve(prompt) + if kb_detail: + for line in kb_detail.splitlines(): + if line.strip(): + lines.append(line.strip()) + tool_lines = _tool_fact_lines(prompt, allow_tools=allow_tools) + if tool_lines: + lines.extend(tool_lines) if not lines: return _ensure_scores("I don't have enough data to answer that.") fact_pack = _fact_pack_text(lines) + fact_meta = _fact_pack_meta(lines) if mode == "fast": return _open_ended_fast( prompt, fact_pack=fact_pack, + fact_lines=lines, + fact_meta=fact_meta, history_lines=history_lines, state=state, ) return _open_ended_deep( prompt, fact_pack=fact_pack, + fact_lines=lines, + fact_meta=fact_meta, history_lines=history_lines, state=state, ) -def _non_cluster_reply(prompt: str) -> str: - return _ensure_scores( - "I focus on the Atlas/Othrys cluster and don't have enough data to answer that." +def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> str: + system = ( + "System: You are Atlas, a helpful general assistant. " + "Answer using common knowledge when possible, and say when you're unsure. " + "Be concise and avoid unnecessary caveats. " + "Respond in plain sentences (no lists unless asked). " + "End every response with a line: 'Confidence: high|medium|low'." 
) + model = _model_for_mode(mode) + context = _append_history_context("", history_lines) if history_lines else "" + reply = _ollama_call( + ("general", "reply"), + prompt, + context=context, + use_history=False, + system_override=system, + model=model, + ) + return _ensure_scores(reply) # Internal HTTP endpoint for cluster answers (website uses this). @@ -3183,7 +3456,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): return cleaned = _strip_bot_mention(prompt) mode = str(payload.get("mode") or "deep").lower() - if mode not in ("fast", "deep"): + if mode in ("quick", "fast"): + mode = "fast" + elif mode in ("smart", "deep"): + mode = "deep" + else: mode = "deep" snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() @@ -3212,37 +3489,19 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot=snapshot, workloads=workloads, ) - fallback = "I don't have enough data to answer that." if cluster_query: - open_ended = ( - _is_subjective_query(cleaned) - or _knowledge_intent(cleaned) - or _is_overview_query(cleaned) - or _doc_intent(cleaned) + answer = open_ended_answer( + cleaned, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + allow_tools=False, + state=None, ) - if open_ended: - answer = open_ended_answer( - cleaned, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history_lines, - mode=mode, - state=None, - ) - else: - answer = ( - cluster_answer( - cleaned, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history_lines, - ) - or fallback - ) else: - answer = _non_cluster_reply(cleaned) + answer = _non_cluster_reply(cleaned, history_lines=history_lines, mode=mode) self._write_json(200, {"answer": answer}) @@ -3490,6 +3749,7 @@ def _ollama_call( context: str, use_history: bool = True, system_override: str | None = None, + model: str | None = None, ) -> str: system = system_override or ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. 
" @@ -3521,7 +3781,8 @@ def _ollama_call( messages.extend(_history_to_messages(history[hist_key][-24:])) messages.append({"role": "user", "content": prompt}) - payload = {"model": MODEL, "messages": messages, "stream": False} + model_name = model or MODEL + payload = {"model": model_name, "messages": messages, "stream": False} headers = {"Content-Type": "application/json"} if API_KEY: headers["x-api-key"] = API_KEY @@ -3561,11 +3822,18 @@ def ollama_reply( context: str, fallback: str = "", use_history: bool = True, + model: str | None = None, ) -> str: last_error = None for attempt in range(max(1, OLLAMA_RETRIES + 1)): try: - return _ollama_call(hist_key, prompt, context=context, use_history=use_history) + return _ollama_call( + hist_key, + prompt, + context=context, + use_history=use_history, + model=model, + ) except Exception as exc: # noqa: BLE001 last_error = exc time.sleep(min(4, 2 ** attempt)) @@ -3584,6 +3852,7 @@ def ollama_reply_with_thinking( context: str, fallback: str, use_history: bool = True, + model: str | None = None, ) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() @@ -3595,6 +3864,7 @@ def ollama_reply_with_thinking( context=context, fallback=fallback, use_history=use_history, + model=model, ) done.set() @@ -3627,6 +3897,7 @@ def open_ended_with_thinking( workloads: list[dict[str, Any]], history_lines: list[str], mode: str, + allow_tools: bool, ) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() @@ -3641,6 +3912,7 @@ def open_ended_with_thinking( workloads=workloads, history_lines=history_lines, mode=mode, + allow_tools=allow_tools, state=state, ) done.set() @@ -3766,39 +4038,24 @@ def sync_loop(token: str, room_id: str): extra = "VictoriaMetrics (PromQL result):\n" + rendered send_msg(token, rid, extra) continue - fallback = "I don't have enough data to answer that." 
- if cluster_query: - open_ended = ( - _is_subjective_query(cleaned_body) - or _knowledge_intent(cleaned_body) - or _is_overview_query(cleaned_body) - or _doc_intent(cleaned_body) + reply = open_ended_with_thinking( + token, + rid, + cleaned_body, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", + allow_tools=allow_tools, ) - if open_ended: - reply = open_ended_with_thinking( - token, - rid, - cleaned_body, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history[hist_key], - mode=mode if mode in ("fast", "deep") else "deep", - ) - else: - reply = ( - cluster_answer( - cleaned_body, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history[hist_key], - ) - or fallback - ) else: - reply = _non_cluster_reply(cleaned_body) + reply = _non_cluster_reply( + cleaned_body, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", + ) send_msg(token, rid, reply) history[hist_key].append(f"Atlas: {reply}") history[hist_key] = history[hist_key][-80:] From 2d900050767a79c476f10befc4c2c22739f7500d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:49:28 -0300 Subject: [PATCH 385/416] atlasbot: fix insight scoring --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index b08f20d..26699b3 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-82 + checksum/atlasbot-configmap: manual-atlasbot-83 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7e6341e..dd6ea2e 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2724,6 +2724,14 @@ def _insight_score( return base +def _score_insight(insight: dict[str, Any], preference: str) -> float: + relevance = _normalize_fraction(insight.get("relevance"), default=0.6) + novelty = _normalize_fraction(insight.get("novelty"), default=0.5) + if preference == "novelty": + return novelty * 0.6 + relevance * 0.4 + return relevance * 0.6 + novelty * 0.4 + + def _select_diverse_insights( candidates: list[dict[str, Any]], *, From 2af817b9dbb6ff115a951d9f0201f32307dac467 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:57:36 -0300 Subject: [PATCH 386/416] atlasbot: speed up fast mode --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 57 +++++++++++++++---------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 26699b3..b9b8ea7 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-83 + checksum/atlasbot-configmap: manual-atlasbot-84 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: 
"kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index dd6ea2e..9108478 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3201,28 +3201,37 @@ def _open_ended_multi( state: ThoughtState | None = None, ) -> str: model = _model_for_mode(mode) - angle_count = 2 if mode == "fast" else 4 - insight_count = 2 if mode == "fast" else 4 - total_steps = 2 + angle_count + 2 + (1 if mode == "deep" else 0) + if mode == "fast": + angle_count = 1 + insight_count = 1 + total_steps = 2 + else: + angle_count = 4 + insight_count = 4 + total_steps = 2 + angle_count + 2 + 1 if state: state.total_steps = total_steps - angles = _open_ended_plan( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - count=angle_count, - state=state, - model=model, - ) - insights = _open_ended_insights( - prompt, - fact_pack=fact_pack, - fact_meta=fact_meta, - history_lines=history_lines, - count=insight_count, - state=state, - model=model, - ) + + angles: list[dict[str, Any]] = [] + insights: list[dict[str, Any]] = [] + if mode != "fast": + angles = _open_ended_plan( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + count=angle_count, + state=state, + model=model, + ) + insights = _open_ended_insights( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + count=insight_count, + state=state, + model=model, + ) seeds = _seed_insights(fact_lines, fact_meta, limit=max(4, insight_count)) insight_candidates = insights + seeds subjective = _is_subjective_query(prompt) @@ -3261,7 +3270,7 @@ def _open_ended_multi( ) candidates: list[dict[str, Any]] = [] - step = 3 + step = 1 if mode == "fast" else 3 for angle in angle_inputs[:angle_count]: candidates.append( _open_ended_candidate( @@ -3308,8 +3317,10 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: - angle_count = 2 if mode == "fast" else 4 - return 2 + angle_count + 2 + (1 if mode == "deep" else 0) + if mode == "fast": + return 2 + angle_count = 4 + return 2 + angle_count + 2 + 1 def _open_ended_fast( From 44c22e3d004ab81dd3b798c88a0422f0650ee39e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 00:22:32 -0300 Subject: [PATCH 387/416] atlasbot: improve multi-pass synthesis --- services/comms/scripts/atlasbot/bot.py | 307 +++++++++++++++++++------ 1 file changed, 239 insertions(+), 68 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9108478..df718e6 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2559,8 +2559,13 @@ def _fact_pack_lines( return lines -def _fact_pack_text(lines: list[str]) -> str: - labeled = [f"F{idx + 1}: {line}" for idx, line in enumerate(lines)] +def _fact_pack_text(lines: list[str], fact_meta: dict[str, dict[str, Any]]) -> str: + labeled: list[str] = [] + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = fact_meta.get(fid, {}).get("tags") or [] + tag_text = f" [tags: {', '.join(tags)}]" if tags else "" + labeled.append(f"{fid}{tag_text}: {line}") return "Fact pack:\n" + "\n".join(labeled) @@ -2782,7 +2787,8 @@ def _open_ended_system() -> str: "Use ONLY the provided fact pack and recent chat as your evidence. " "You may draw light inferences if you label them as such. " "Write concise, human sentences with a helpful, calm tone (not a list). 
" - "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. " + "If the question is subjective (cool/interesting/unconventional), pick a standout fact and explain why it stands out. " + "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " "Do not invent numbers or facts. " @@ -2938,6 +2944,67 @@ def _open_ended_plan( return cleaned +def _sanitize_focus_tags(raw_tags: list[Any]) -> list[str]: + tags: list[str] = [] + for tag in raw_tags: + if not isinstance(tag, str): + continue + tag = tag.strip() + if tag in _ALLOWED_INSIGHT_TAGS and tag not in tags: + tags.append(tag) + return tags + + +def _open_ended_interpret( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None, + model: str | None, +) -> dict[str, Any]: + if state: + state.update("interpreting", step=1, note="reading question") + allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) + prompt_text = ( + "Classify how to answer the question using only the fact pack. " + "Return JSON: {\"style\":\"objective|subjective\"," + "\"tone\":\"neutral|curious|enthusiastic\"," + "\"focus_tags\":[\"tag\"]," + "\"focus_label\":\"short phrase\"," + "\"allow_list\":true|false}. " + "Use allow_list=true only if the question explicitly asks for names or lists. " + f"Only use tags from: {allowed_tags}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + if not isinstance(result, dict): + result = {} + style = str(result.get("style") or "").strip().lower() + if style not in ("objective", "subjective"): + style = "subjective" if _is_subjective_query(prompt) else "objective" + tone = str(result.get("tone") or "neutral").strip().lower() + if tone not in ("neutral", "curious", "enthusiastic"): + tone = "neutral" + focus_tags = _sanitize_focus_tags(result.get("focus_tags") or []) + focus_label = str(result.get("focus_label") or "").strip() + allow_list = result.get("allow_list") + if not isinstance(allow_list, bool): + q = normalize_query(prompt) + allow_list = any(phrase in q for phrase in ("list", "which", "what are", "names")) + return { + "style": style, + "tone": tone, + "focus_tags": focus_tags, + "focus_label": focus_label, + "allow_list": allow_list, + } + + def _preferred_tags_for_prompt(prompt: str) -> set[str]: q = normalize_query(prompt) tags: set[str] = set() @@ -3013,6 +3080,71 @@ def _open_ended_insights( return cleaned +def _fallback_fact_ids( + fact_meta: dict[str, dict[str, Any]], + *, + focus_tags: set[str], + count: int, +) -> list[str]: + if not fact_meta: + return [] + if focus_tags: + tagged = [ + fid + for fid, meta in fact_meta.items() + if focus_tags & set(meta.get("tags") or []) + ] + if tagged: + return tagged[:count] + return list(fact_meta.keys())[:count] + + +def _open_ended_select_facts( + prompt: str, + *, + fact_pack: str, + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + focus_tags: set[str], + avoid_fact_ids: list[str], + count: int, + subjective: bool, + state: ThoughtState | None, + step: int, + model: str | None, +) -> list[str]: + if state: + state.update("selecting facts", step=step, note="picking evidence") + focus_hint = ", 
".join(sorted(focus_tags)) if focus_tags else "any" + avoid_hint = ", ".join(avoid_fact_ids) if avoid_fact_ids else "none" + prompt_text = ( + "Select the fact IDs that best answer the question. " + f"Pick up to {count} fact IDs. " + f"Focus tags: {focus_hint}. " + f"Avoid these fact IDs: {avoid_hint}. " + "If the question is subjective, pick standout or unusual facts; " + "if objective, pick the minimal facts needed. " + "Return JSON: {\"fact_ids\":[\"F1\"...],\"note\":\"...\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + fact_ids = result.get("fact_ids") if isinstance(result, dict) else None + selected: list[str] = [] + if isinstance(fact_ids, list): + for fid in fact_ids: + if isinstance(fid, str) and fid in fact_meta and fid not in selected: + selected.append(fid) + if len(selected) >= count: + break + if not selected: + selected = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + return selected + + def _normalize_score(value: Any, *, default: int = 60) -> int: if isinstance(value, (int, float)): return int(max(0, min(100, value))) @@ -3043,6 +3175,9 @@ def _open_ended_candidate( focus: str, fact_pack: str, history_lines: list[str], + subjective: bool, + tone: str, + allow_list: bool, state: ThoughtState | None, step: int, fact_hints: list[str] | None = None, @@ -3053,10 +3188,23 @@ def _open_ended_candidate( hint_text = "" if fact_hints: hint_text = " Prioritize these fact IDs if relevant: " + ", ".join(fact_hints) + "." + style_hint = ( + "Offer a brief opinion grounded in facts and explain why it stands out. " + if subjective + else "Answer directly and succinctly. " + ) + list_hint = ( + "If a list is requested, embed it inline in a sentence (comma-separated). " + if allow_list + else "Avoid bullet lists. " + ) prompt_text = ( "Using ONLY the fact pack, answer the question focusing on this angle: " f"{focus}. " - "Write 2-4 sentences in plain prose (not a list)." + f"Tone: {tone}. " + + style_hint + + list_hint + + "Write 2-4 sentences in plain prose." + hint_text + " " "If you infer, label it as inference. " @@ -3125,6 +3273,9 @@ def _open_ended_synthesize( fact_pack: str, history_lines: list[str], candidates: list[dict[str, Any]], + subjective: bool, + tone: str, + allow_list: bool, state: ThoughtState | None, step: int, model: str | None, @@ -3133,6 +3284,16 @@ def _open_ended_synthesize( if state: state.update("synthesizing", step=step, note="composing answer") critique_block = f"\nCritique guidance: {critique}\n" if critique else "\n" + style_hint = ( + "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. " + if subjective + else "Answer directly without extra caveats. " + ) + list_hint = ( + "If a list is requested, embed it inline in a sentence (comma-separated). " + if allow_list + else "Avoid bullet lists. " + ) synth_prompt = ( "Compose the final answer to the question using the candidate answers below. " "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. " @@ -3140,7 +3301,10 @@ def _open_ended_synthesize( "If you infer, label it as inference. " "Do not claim nodes are missing or not ready unless the fact pack explicitly lists " "nodes_not_ready or expected_workers_missing. " - "Keep the tone conversational and answer the user's intent directly. " + f"Tone: {tone}. 
" + + style_hint + + list_hint + + "Keep the tone conversational and answer the user's intent directly. " "Avoid repeating the last response if possible. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " "HallucinationRisk (low|medium|high).\n" @@ -3202,85 +3366,90 @@ def _open_ended_multi( ) -> str: model = _model_for_mode(mode) if mode == "fast": - angle_count = 1 - insight_count = 1 - total_steps = 2 + total_steps = 4 else: - angle_count = 4 - insight_count = 4 - total_steps = 2 + angle_count + 2 + 1 + total_steps = 7 if state: state.total_steps = total_steps - angles: list[dict[str, Any]] = [] - insights: list[dict[str, Any]] = [] - if mode != "fast": - angles = _open_ended_plan( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - count=angle_count, - state=state, - model=model, - ) - insights = _open_ended_insights( + interpretation = _open_ended_interpret( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + state=state, + model=model, + ) + style = interpretation.get("style") or "objective" + subjective = style == "subjective" or _is_subjective_query(prompt) + tone = str(interpretation.get("tone") or "").strip().lower() + if tone not in ("neutral", "curious", "enthusiastic"): + tone = "curious" if subjective else "neutral" + allow_list = bool(interpretation.get("allow_list")) + focus_tags = set(interpretation.get("focus_tags") or []) or _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + + primary_ids = _open_ended_select_facts( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + focus_tags=focus_tags, + avoid_fact_ids=[], + count=4 if mode == "deep" else 3, + subjective=subjective, + state=state, + step=2, + model=model, + ) + alternate_ids: list[str] = [] + if mode == "deep": + alternate_ids = _open_ended_select_facts( prompt, fact_pack=fact_pack, fact_meta=fact_meta, history_lines=history_lines, - count=insight_count, + focus_tags=focus_tags, + avoid_fact_ids=primary_ids, + count=4, + subjective=subjective, state=state, + step=3, model=model, ) - seeds = _seed_insights(fact_lines, fact_meta, limit=max(4, insight_count)) - insight_candidates = insights + seeds - subjective = _is_subjective_query(prompt) - prefer_tags = _preferred_tags_for_prompt(prompt) - history_tags = _history_tags(history_lines) - avoid_tags = history_tags if subjective else set() - preference = "novelty" if subjective else "relevance" - selected_insights = _select_diverse_insights( - insight_candidates, - preference=preference, - prefer_tags=prefer_tags, - avoid_tags=avoid_tags, - history_tags=history_tags, - fact_meta=fact_meta, - count=1 if mode == "fast" else 2, - ) - if state and selected_insights: - state.update("analyzing", note=_candidate_note(selected_insights[0])) - - angle_inputs: list[dict[str, Any]] = [] - for insight in selected_insights: - angle_inputs.append( - { - "focus": str(insight.get("summary") or "Direct answer"), - "fact_ids": insight.get("fact_ids") or [], - } - ) - for angle in angles: - if len(angle_inputs) >= angle_count: - break - angle_inputs.append( - { - "focus": str(angle.get("focus") or "Direct answer"), - "fact_ids": [], - } - ) candidates: list[dict[str, Any]] = [] - step = 1 if mode == "fast" else 3 - for angle in angle_inputs[:angle_count]: + focus_label = interpretation.get("focus_label") or "primary angle" + step = 3 if mode == "fast" else 4 + candidates.append( + _open_ended_candidate( + prompt, + focus=str(focus_label), + 
fact_pack=fact_pack, + history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + fact_hints=primary_ids, + model=model, + ) + ) + step += 1 + if mode == "deep" and alternate_ids: candidates.append( _open_ended_candidate( prompt, - focus=str(angle.get("focus") or "Direct answer"), + focus="alternate angle", fact_pack=fact_pack, history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, state=state, step=step, - fact_hints=angle.get("fact_ids") if isinstance(angle.get("fact_ids"), list) else None, + fact_hints=alternate_ids, model=model, ) ) @@ -3306,6 +3475,9 @@ def _open_ended_multi( fact_pack=fact_pack, history_lines=history_lines, candidates=selected or candidates, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, state=state, step=step, model=model, @@ -3318,9 +3490,8 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 2 - angle_count = 4 - return 2 + angle_count + 2 + 1 + return 4 + return 7 def _open_ended_fast( @@ -3386,8 +3557,8 @@ def open_ended_answer( lines.extend(tool_lines) if not lines: return _ensure_scores("I don't have enough data to answer that.") - fact_pack = _fact_pack_text(lines) fact_meta = _fact_pack_meta(lines) + fact_pack = _fact_pack_text(lines, fact_meta) if mode == "fast": return _open_ended_fast( prompt, From 6578a8b08ac0b693f26c878dc525c473c6beee91 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 00:24:13 -0300 Subject: [PATCH 388/416] atlasbot: roll config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index b9b8ea7..bc6790b 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-84 + checksum/atlasbot-configmap: manual-atlasbot-85 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 474c472b1dd18d0430bfd262048dba3c06d5007a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:02:14 -0300 Subject: [PATCH 389/416] atlasbot: enrich fact pack and selection --- services/comms/scripts/atlasbot/bot.py | 104 +++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 8 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index df718e6..55c6da2 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -936,6 +936,28 @@ def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]: per_node.setdefault(node, {})[metric_name] = entry.get("value") return [{"node": node, **vals} for node, vals in sorted(per_node.items())] +def _usage_extremes(usage_table: list[dict[str, Any]]) -> dict[str, tuple[str, float]]: + extremes: dict[str, tuple[str, float]] = {} + for metric in ("cpu", "ram", "net", "io"): + values: list[tuple[str, float]] = [] + for entry in usage_table: + node = entry.get("node") + raw = entry.get(metric) + if not node or raw is None: + continue + try: + value = float(raw) + except (TypeError, ValueError): + continue + values.append((node, value)) + if not values: + continue + lowest = min(values, key=lambda item: item[1]) + highest = max(values, 
key=lambda item: item[1]) + extremes[f"min_{metric}"] = lowest + extremes[f"max_{metric}"] = highest + return extremes + def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]: cleaned: list[dict[str, Any]] = [] for entry in workloads: @@ -1023,6 +1045,13 @@ def facts_context( lines.append(f"- arch {key}: {', '.join(nodes_list)}") if control_plane_nodes: lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}") + control_plane_by_hw: dict[str, list[str]] = collections.defaultdict(list) + for node in inv: + if node.get("name") in control_plane_nodes: + control_plane_by_hw[node.get("hardware") or "unknown"].append(node["name"]) + parts = [f"{hw}={', '.join(sorted(nodes))}" for hw, nodes in sorted(control_plane_by_hw.items())] + if parts: + lines.append(f"- control_plane_by_hardware: {', '.join(parts)}") if worker_nodes: lines.append(f"- worker_nodes: {', '.join(worker_nodes)}") if ready_workers or not_ready_workers: @@ -1068,6 +1097,22 @@ def facts_context( if value is not None: lines.append(f"- {key}: {value}") + top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else [] + if top_restarts: + items = [] + for entry in top_restarts[:5]: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") or {} + pod = metric.get("pod") or metric.get("name") or "" + ns = metric.get("namespace") or "" + value = entry.get("value") + label = f"{ns}/{pod}".strip("/") + if label and value is not None: + items.append(f"{label}={value}") + if items: + lines.append(f"- top_restarts_1h: {', '.join(items)}") + usage_table = _node_usage_table(metrics) if usage_table: lines.append("- node_usage (cpu/ram/net/io):") @@ -1088,6 +1133,18 @@ def facts_context( else "" ) lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") + extremes = _usage_extremes(usage_table) + for metric in ("cpu", "ram", "net", "io"): + min_key = f"min_{metric}" + if min_key not in extremes: + continue + node, value = extremes[min_key] + value_fmt = _format_metric_value( + str(value), + percent=metric in ("cpu", "ram"), + rate=metric in ("net", "io"), + ) + lines.append(f"- lowest_{metric}: {node} ({value_fmt})") if nodes_in_query: lines.append("- node_details:") @@ -1112,13 +1169,37 @@ def facts_context( wl = entry.get("workload") or "" primary = entry.get("primary_node") or "" pods_total = entry.get("pods_total") + pods_running = entry.get("pods_running") label = f"{ns}/{wl}" if ns and wl else (wl or ns) if not label: continue if primary: - lines.append(f" - {label}: primary_node={primary}, pods_total={pods_total}") + lines.append( + f" - {label}: primary_node={primary}, pods_total={pods_total}, pods_running={pods_running}" + ) else: - lines.append(f" - {label}: pods_total={pods_total}") + lines.append(f" - {label}: pods_total={pods_total}, pods_running={pods_running}") + top = max( + (entry for entry in workload_entries if isinstance(entry.get("pods_total"), (int, float))), + key=lambda item: item.get("pods_total", 0), + default=None, + ) + if isinstance(top, dict) and top.get("pods_total") is not None: + label = f"{top.get('namespace')}/{top.get('workload')}".strip("/") + lines.append(f"- workload_most_pods: {label} ({top.get('pods_total')})") + zero_running = [ + entry + for entry in workload_entries + if isinstance(entry.get("pods_running"), (int, float)) and entry.get("pods_running") == 0 + ] + if zero_running: + labels = [] + for entry in zero_running: + label = 
f"{entry.get('namespace')}/{entry.get('workload')}".strip("/") + if label: + labels.append(label) + if labels: + lines.append(f"- workloads_zero_running: {', '.join(labels)}") rendered = "\n".join(lines) return rendered[:MAX_FACTS_CHARS] @@ -2609,15 +2690,15 @@ def _fact_line_tags(line: str) -> set[str]: tags.add("architecture") if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")): tags.update({"hardware", "inventory"}) - if "control_plane_nodes" in text or "worker_nodes" in text: + if "control_plane_nodes" in text or "control_plane_by_hardware" in text or "worker_nodes" in text: tags.add("inventory") - if any(key in text for key in ("hottest_", "node_usage", "cpu=", "ram=", "net=", "io=")): + if any(key in text for key in ("hottest_", "lowest_", "node_usage", "cpu=", "ram=", "net=", "io=")): tags.add("utilization") if "postgres_" in text or "postgres connections" in text: tags.add("database") - if "pods_" in text or "pod phases" in text: + if "pods_" in text or "pod phases" in text or "restarts" in text: tags.add("pods") - if "workloads" in text or "primary_node" in text: + if "workloads" in text or "primary_node" in text or "workload_" in text: tags.add("workloads") if "node_details" in text: tags.add("node_detail") @@ -3140,8 +3221,15 @@ def _open_ended_select_facts( selected.append(fid) if len(selected) >= count: break - if not selected: - selected = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + seed = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + if selected: + for fid in seed: + if fid not in selected: + selected.append(fid) + if len(selected) >= count: + break + else: + selected = seed return selected From 2fe3d5b932ec70621730cbb7600967db82368f6b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:02:32 -0300 Subject: [PATCH 390/416] atlasbot: roll config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index bc6790b..7ce144c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-85 + checksum/atlasbot-configmap: manual-atlasbot-86 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 60018764097b6891e9bcf07cdb57a13d57d14669 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:07:13 -0300 Subject: [PATCH 391/416] atlasbot: add per-hardware extremes --- services/comms/scripts/atlasbot/bot.py | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 55c6da2..50fed4b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1145,6 +1145,33 @@ def facts_context( rate=metric in ("net", "io"), ) lines.append(f"- lowest_{metric}: {node} ({value_fmt})") + for metric in ("cpu", "ram"): + hottest_parts: list[str] = [] + lowest_parts: list[str] = [] + for hw, nodes_list in sorted(by_hardware.items()): + entries = [] + for entry in usage_table: + node = entry.get("node") + if node in nodes_list and entry.get(metric) is not None: + try: + value = float(entry.get(metric)) + except (TypeError, ValueError): + continue + 
entries.append((node, value)) + if not entries: + continue + max_node, max_val = max(entries, key=lambda item: item[1]) + min_node, min_val = min(entries, key=lambda item: item[1]) + hottest_parts.append( + f"{hw}={max_node} ({_format_metric_value(str(max_val), percent=True)})" + ) + lowest_parts.append( + f"{hw}={min_node} ({_format_metric_value(str(min_val), percent=True)})" + ) + if hottest_parts: + lines.append(f"- hottest_{metric}_by_hardware: {', '.join(hottest_parts)}") + if lowest_parts: + lines.append(f"- lowest_{metric}_by_hardware: {', '.join(lowest_parts)}") if nodes_in_query: lines.append("- node_details:") From 48c379dc88869606ef5c5ce05ca83d896f91c018 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:07:26 -0300 Subject: [PATCH 392/416] comms: roll atlasbot config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7ce144c..1d89335 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-86 + checksum/atlasbot-configmap: manual-atlasbot-87 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 0146e3dc951234eeee04a34b96c2c4b5873da9e0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:35:34 -0300 Subject: [PATCH 393/416] maintenance: suspend ariadne migrate job --- services/maintenance/ariadne-migrate-job.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml index b9b1496..367a1a0 100644 --- a/services/maintenance/ariadne-migrate-job.yaml +++ b/services/maintenance/ariadne-migrate-job.yaml @@ -7,6 +7,7 @@ metadata: annotations: kustomize.toolkit.fluxcd.io/force: "true" spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: @@ -15,6 +16,7 @@ spec: app: ariadne-migrate annotations: vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" vault.hashicorp.com/role: "maintenance" vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | From 244578cc0120edab2766053ab62a513408434c29 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:48:32 -0300 Subject: [PATCH 394/416] chore: organize one-off jobs --- .../kustomization.yaml | 2 +- services/bstein-dev-home/kustomization.yaml | 2 +- .../migrations/kustomization.yaml | 2 +- .../migrations/portal-migrate-job.yaml | 7 ++++- .../portal-onboarding-e2e-test-job.yaml | 7 ++++- services/comms/kustomization.yaml | 20 ++++++------- .../{ => oneoffs}/bstein-force-leave-job.yaml | 7 ++++- .../comms-secrets-ensure-job.yaml | 7 ++++- .../mas-admin-client-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/mas-db-ensure-job.yaml | 7 ++++- .../mas-local-users-ensure-job.yaml | 7 ++++- .../othrys-kick-numeric-job.yaml | 7 ++++- .../synapse-admin-ensure-job.yaml | 7 ++++- .../synapse-seeder-admin-ensure-job.yaml | 7 ++++- .../synapse-signingkey-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/synapse-user-seed-job.yaml | 7 ++++- services/finance/kustomization.yaml | 2 +- .../finance-secrets-ensure-job.yaml | 7 ++++- services/keycloak/kustomization.yaml 
| 30 +++++++++---------- .../actual-oidc-secret-ensure-job.yaml | 7 ++++- .../harbor-oidc-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/ldap-federation-job.yaml | 7 ++++- .../logs-oidc-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/mas-secrets-ensure-job.yaml | 7 ++++- ...portal-admin-client-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/portal-e2e-client-job.yaml | 7 ++++- ...al-e2e-execute-actions-email-test-job.yaml | 7 ++++- .../portal-e2e-target-client-job.yaml | 7 ++++- ...al-e2e-token-exchange-permissions-job.yaml | 7 ++++- .../portal-e2e-token-exchange-test-job.yaml | 7 ++++- .../{ => oneoffs}/realm-settings-job.yaml | 7 ++++- .../synapse-oidc-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/user-overrides-job.yaml | 7 ++++- .../vault-oidc-secret-ensure-job.yaml | 7 ++++- services/logging/kustomization.yaml | 6 ++-- .../opensearch-dashboards-setup-job.yaml | 7 ++++- .../{ => oneoffs}/opensearch-ism-job.yaml | 7 ++++- .../opensearch-observability-setup-job.yaml | 7 ++++- services/mailu/kustomization.yaml | 2 +- .../mailu/{ => oneoffs}/mailu-sync-job.yaml | 7 ++++- services/maintenance/kustomization.yaml | 4 +-- .../{ => oneoffs}/ariadne-migrate-job.yaml | 6 +++- .../k3s-traefik-cleanup-job.yaml | 7 ++++- services/monitoring/kustomization.yaml | 4 +-- .../{ => oneoffs}/grafana-org-bootstrap.yaml | 7 ++++- .../grafana-user-dedupe-job.yaml | 7 ++++- 46 files changed, 252 insertions(+), 73 deletions(-) rename services/bstein-dev-home/{ => oneoffs}/migrations/kustomization.yaml (66%) rename services/bstein-dev-home/{ => oneoffs}/migrations/portal-migrate-job.yaml (78%) rename services/bstein-dev-home/{ => oneoffs}/portal-onboarding-e2e-test-job.yaml (89%) rename services/comms/{ => oneoffs}/bstein-force-leave-job.yaml (96%) rename services/comms/{ => oneoffs}/comms-secrets-ensure-job.yaml (92%) rename services/comms/{ => oneoffs}/mas-admin-client-secret-ensure-job.yaml (90%) rename services/comms/{ => oneoffs}/mas-db-ensure-job.yaml (91%) rename services/comms/{ => oneoffs}/mas-local-users-ensure-job.yaml (97%) rename services/comms/{ => oneoffs}/othrys-kick-numeric-job.yaml (96%) rename services/comms/{ => oneoffs}/synapse-admin-ensure-job.yaml (96%) rename services/comms/{ => oneoffs}/synapse-seeder-admin-ensure-job.yaml (93%) rename services/comms/{ => oneoffs}/synapse-signingkey-ensure-job.yaml (88%) rename services/comms/{ => oneoffs}/synapse-user-seed-job.yaml (96%) rename services/finance/{ => oneoffs}/finance-secrets-ensure-job.yaml (83%) rename services/keycloak/{ => oneoffs}/actual-oidc-secret-ensure-job.yaml (83%) rename services/keycloak/{ => oneoffs}/harbor-oidc-secret-ensure-job.yaml (83%) rename services/keycloak/{ => oneoffs}/ldap-federation-job.yaml (98%) rename services/keycloak/{ => oneoffs}/logs-oidc-secret-ensure-job.yaml (94%) rename services/keycloak/{ => oneoffs}/mas-secrets-ensure-job.yaml (95%) rename services/keycloak/{ => oneoffs}/portal-admin-client-secret-ensure-job.yaml (96%) rename services/keycloak/{ => oneoffs}/portal-e2e-client-job.yaml (97%) rename services/keycloak/{ => oneoffs}/portal-e2e-execute-actions-email-test-job.yaml (89%) rename services/keycloak/{ => oneoffs}/portal-e2e-target-client-job.yaml (95%) rename services/keycloak/{ => oneoffs}/portal-e2e-token-exchange-permissions-job.yaml (97%) rename services/keycloak/{ => oneoffs}/portal-e2e-token-exchange-test-job.yaml (89%) rename services/keycloak/{ => oneoffs}/realm-settings-job.yaml (98%) rename services/keycloak/{ => oneoffs}/synapse-oidc-secret-ensure-job.yaml (92%) rename 
services/keycloak/{ => oneoffs}/user-overrides-job.yaml (96%) rename services/keycloak/{ => oneoffs}/vault-oidc-secret-ensure-job.yaml (83%) rename services/logging/{ => oneoffs}/opensearch-dashboards-setup-job.yaml (88%) rename services/logging/{ => oneoffs}/opensearch-ism-job.yaml (91%) rename services/logging/{ => oneoffs}/opensearch-observability-setup-job.yaml (76%) rename services/mailu/{ => oneoffs}/mailu-sync-job.yaml (93%) rename services/maintenance/{ => oneoffs}/ariadne-migrate-job.yaml (82%) rename services/maintenance/{ => oneoffs}/k3s-traefik-cleanup-job.yaml (77%) rename services/monitoring/{ => oneoffs}/grafana-org-bootstrap.yaml (93%) rename services/monitoring/{ => oneoffs}/grafana-user-dedupe-job.yaml (94%) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml index da61b2d..ff97f73 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml @@ -6,7 +6,7 @@ metadata: namespace: flux-system spec: interval: 10m - path: ./services/bstein-dev-home/migrations + path: ./services/bstein-dev-home/oneoffs/migrations prune: true force: true sourceRef: diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a813241..f62fb17 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -16,7 +16,7 @@ resources: - backend-deployment.yaml - backend-service.yaml - vaultwarden-cred-sync-cronjob.yaml - - portal-onboarding-e2e-test-job.yaml + - oneoffs/portal-onboarding-e2e-test-job.yaml - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend diff --git a/services/bstein-dev-home/migrations/kustomization.yaml b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml similarity index 66% rename from services/bstein-dev-home/migrations/kustomization.yaml rename to services/bstein-dev-home/oneoffs/migrations/kustomization.yaml index 067665b..1d1dfc8 100644 --- a/services/bstein-dev-home/migrations/kustomization.yaml +++ b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml @@ -1,4 +1,4 @@ -# services/bstein-dev-home/migrations/kustomization.yaml +# services/bstein-dev-home/oneoffs/migrations/kustomization.yaml apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: bstein-dev-home diff --git a/services/bstein-dev-home/migrations/portal-migrate-job.yaml b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml similarity index 78% rename from services/bstein-dev-home/migrations/portal-migrate-job.yaml rename to services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml index 9d05254..1f7e092 100644 --- a/services/bstein-dev-home/migrations/portal-migrate-job.yaml +++ b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml @@ -1,4 +1,8 @@ -# services/bstein-dev-home/migrations/portal-migrate-job.yaml +# services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml +# One-off job for bstein-dev-home/bstein-dev-home-portal-migrate-36. +# Purpose: bstein dev home portal migrate 36 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
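+# For example (Kustomization name assumed from the Flux application path
+# clusters/atlas/flux-system/applications/bstein-dev-home-migrations):
+#   1. set suspend: false in this file, then commit and push
+#   2. flux reconcile kustomization bstein-dev-home-migrations -n flux-system
+#   3. kubectl -n bstein-dev-home wait --for=condition=complete \
+#        job/bstein-dev-home-portal-migrate-36
+#   4. restore suspend: true and push so the Job stays dormant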
apiVersion: batch/v1 kind: Job metadata: @@ -7,6 +11,7 @@ metadata: annotations: kustomize.toolkit.fluxcd.io/force: "true" spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml similarity index 89% rename from services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml rename to services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml index 681e89d..9923499 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml @@ -1,10 +1,15 @@ -# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +# services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml +# One-off job for bstein-dev-home/portal-onboarding-e2e-test-27. +# Purpose: portal onboarding e2e test 27 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: portal-onboarding-e2e-test-27 namespace: bstein-dev-home spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 01d7be5..969ca58 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -22,24 +22,24 @@ resources: - mas-db-ensure-rbac.yaml - synapse-signingkey-ensure-rbac.yaml - vault-sync-deployment.yaml - - mas-admin-client-secret-ensure-job.yaml - - mas-db-ensure-job.yaml - - comms-secrets-ensure-job.yaml - - synapse-admin-ensure-job.yaml - - synapse-signingkey-ensure-job.yaml - - synapse-seeder-admin-ensure-job.yaml - - synapse-user-seed-job.yaml - - mas-local-users-ensure-job.yaml + - oneoffs/mas-admin-client-secret-ensure-job.yaml + - oneoffs/mas-db-ensure-job.yaml + - oneoffs/comms-secrets-ensure-job.yaml + - oneoffs/synapse-admin-ensure-job.yaml + - oneoffs/synapse-signingkey-ensure-job.yaml + - oneoffs/synapse-seeder-admin-ensure-job.yaml + - oneoffs/synapse-user-seed-job.yaml + - oneoffs/mas-local-users-ensure-job.yaml - mas-deployment.yaml - livekit-token-deployment.yaml - livekit.yaml - coturn.yaml - seed-othrys-room.yaml - guest-name-job.yaml - - othrys-kick-numeric-job.yaml + - oneoffs/othrys-kick-numeric-job.yaml - pin-othrys-job.yaml - reset-othrys-room-job.yaml - - bstein-force-leave-job.yaml + - oneoffs/bstein-force-leave-job.yaml - livekit-ingress.yaml - livekit-middlewares.yaml - matrix-ingress.yaml diff --git a/services/comms/bstein-force-leave-job.yaml b/services/comms/oneoffs/bstein-force-leave-job.yaml similarity index 96% rename from services/comms/bstein-force-leave-job.yaml rename to services/comms/oneoffs/bstein-force-leave-job.yaml index 0286f8c..7efe826 100644 --- a/services/comms/bstein-force-leave-job.yaml +++ b/services/comms/oneoffs/bstein-force-leave-job.yaml @@ -1,10 +1,15 @@ -# services/comms/bstein-force-leave-job.yaml +# services/comms/oneoffs/bstein-force-leave-job.yaml +# One-off job for comms/bstein-leave-rooms-12. +# Purpose: bstein leave rooms 12 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
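+# (Suspension semantics, per upstream Kubernetes docs: a Job created with
+# suspend: true launches no pods until unsuspended, and a completed Job
+# cannot be re-run in place; hence the numeric suffix bumps when a
+# one-off needs to run again.)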
apiVersion: batch/v1 kind: Job metadata: name: bstein-leave-rooms-12 namespace: comms spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/comms-secrets-ensure-job.yaml b/services/comms/oneoffs/comms-secrets-ensure-job.yaml similarity index 92% rename from services/comms/comms-secrets-ensure-job.yaml rename to services/comms/oneoffs/comms-secrets-ensure-job.yaml index 52904cc..35ca73c 100644 --- a/services/comms/comms-secrets-ensure-job.yaml +++ b/services/comms/oneoffs/comms-secrets-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/comms-secrets-ensure-job.yaml +# services/comms/oneoffs/comms-secrets-ensure-job.yaml +# One-off job for comms/comms-secrets-ensure-7. +# Purpose: comms secrets ensure 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: comms-secrets-ensure-7 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/mas-admin-client-secret-ensure-job.yaml b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml similarity index 90% rename from services/comms/mas-admin-client-secret-ensure-job.yaml rename to services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml index 7b05cca..e1d5458 100644 --- a/services/comms/mas-admin-client-secret-ensure-job.yaml +++ b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml @@ -1,4 +1,8 @@ -# services/comms/mas-admin-client-secret-ensure-job.yaml +# services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml +# One-off job for comms/mas-admin-client-secret-writer. +# Purpose: mas admin client secret writer (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: v1 kind: ServiceAccount metadata: @@ -41,6 +45,7 @@ metadata: name: mas-admin-client-secret-ensure-11 namespace: comms spec: + suspend: true backoffLimit: 2 template: spec: diff --git a/services/comms/mas-db-ensure-job.yaml b/services/comms/oneoffs/mas-db-ensure-job.yaml similarity index 91% rename from services/comms/mas-db-ensure-job.yaml rename to services/comms/oneoffs/mas-db-ensure-job.yaml index 56707a9..44137da 100644 --- a/services/comms/mas-db-ensure-job.yaml +++ b/services/comms/oneoffs/mas-db-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/mas-db-ensure-job.yaml +# services/comms/oneoffs/mas-db-ensure-job.yaml +# One-off job for comms/mas-db-ensure-22. +# Purpose: mas db ensure 22 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: mas-db-ensure-22 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 600 template: diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/oneoffs/mas-local-users-ensure-job.yaml similarity index 97% rename from services/comms/mas-local-users-ensure-job.yaml rename to services/comms/oneoffs/mas-local-users-ensure-job.yaml index 636ee5b..7b51072 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/oneoffs/mas-local-users-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/mas-local-users-ensure-job.yaml +# services/comms/oneoffs/mas-local-users-ensure-job.yaml +# One-off job for comms/mas-local-users-ensure-18. +# Purpose: mas local users ensure 18 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: mas-local-users-ensure-18 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/othrys-kick-numeric-job.yaml b/services/comms/oneoffs/othrys-kick-numeric-job.yaml similarity index 96% rename from services/comms/othrys-kick-numeric-job.yaml rename to services/comms/oneoffs/othrys-kick-numeric-job.yaml index 0d3914a..e38a6bb 100644 --- a/services/comms/othrys-kick-numeric-job.yaml +++ b/services/comms/oneoffs/othrys-kick-numeric-job.yaml @@ -1,10 +1,15 @@ -# services/comms/othrys-kick-numeric-job.yaml +# services/comms/oneoffs/othrys-kick-numeric-job.yaml +# One-off job for comms/othrys-kick-numeric-8. +# Purpose: othrys kick numeric 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: othrys-kick-numeric-8 namespace: comms spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-admin-ensure-job.yaml similarity index 96% rename from services/comms/synapse-admin-ensure-job.yaml rename to services/comms/oneoffs/synapse-admin-ensure-job.yaml index 5ddf60c..95bc9f2 100644 --- a/services/comms/synapse-admin-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-admin-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-admin-ensure-job.yaml +# services/comms/oneoffs/synapse-admin-ensure-job.yaml +# One-off job for comms/synapse-admin-ensure-3. +# Purpose: synapse admin ensure 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
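+# Note: spec.suspend is mutable on Jobs (stable since Kubernetes 1.24), so
+# an ad-hoc run can also be started with, for example:
+#   kubectl -n comms patch job mas-local-users-ensure-18 \
+#     --type=merge -p '{"spec":{"suspend":false}}'
+# but Flux re-asserts suspend: true on its next reconcile, so the git edit
+# described above is the durable path.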
apiVersion: batch/v1 kind: Job metadata: name: synapse-admin-ensure-3 namespace: comms spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml similarity index 93% rename from services/comms/synapse-seeder-admin-ensure-job.yaml rename to services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml index 5d2d422..1d8972e 100644 --- a/services/comms/synapse-seeder-admin-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-seeder-admin-ensure-job.yaml +# services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml +# One-off job for comms/synapse-seeder-admin-ensure-9. +# Purpose: synapse seeder admin ensure 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: synapse-seeder-admin-ensure-9 namespace: comms spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/comms/synapse-signingkey-ensure-job.yaml b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml similarity index 88% rename from services/comms/synapse-signingkey-ensure-job.yaml rename to services/comms/oneoffs/synapse-signingkey-ensure-job.yaml index 402a820..bbc4595 100644 --- a/services/comms/synapse-signingkey-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-signingkey-ensure-job.yaml +# services/comms/oneoffs/synapse-signingkey-ensure-job.yaml +# One-off job for comms/othrys-synapse-signingkey-ensure-7. +# Purpose: othrys synapse signingkey ensure 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: othrys-synapse-signingkey-ensure-7 namespace: comms spec: + suspend: true backoffLimit: 2 template: spec: diff --git a/services/comms/synapse-user-seed-job.yaml b/services/comms/oneoffs/synapse-user-seed-job.yaml similarity index 96% rename from services/comms/synapse-user-seed-job.yaml rename to services/comms/oneoffs/synapse-user-seed-job.yaml index aab88c3..a732739 100644 --- a/services/comms/synapse-user-seed-job.yaml +++ b/services/comms/oneoffs/synapse-user-seed-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-user-seed-job.yaml +# services/comms/oneoffs/synapse-user-seed-job.yaml +# One-off job for comms/synapse-user-seed-8. +# Purpose: synapse user seed 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: synapse-user-seed-8 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/finance/kustomization.yaml b/services/finance/kustomization.yaml index e4c414f..1559f5c 100644 --- a/services/finance/kustomization.yaml +++ b/services/finance/kustomization.yaml @@ -9,7 +9,7 @@ resources: - finance-secrets-ensure-rbac.yaml - actual-budget-data-pvc.yaml - firefly-storage-pvc.yaml - - finance-secrets-ensure-job.yaml + - oneoffs/finance-secrets-ensure-job.yaml - actual-budget-deployment.yaml - firefly-deployment.yaml - firefly-user-sync-cronjob.yaml diff --git a/services/finance/finance-secrets-ensure-job.yaml b/services/finance/oneoffs/finance-secrets-ensure-job.yaml similarity index 83% rename from services/finance/finance-secrets-ensure-job.yaml rename to services/finance/oneoffs/finance-secrets-ensure-job.yaml index 67f06cb..e8c8f58 100644 --- a/services/finance/finance-secrets-ensure-job.yaml +++ b/services/finance/oneoffs/finance-secrets-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/finance/finance-secrets-ensure-job.yaml +# services/finance/oneoffs/finance-secrets-ensure-job.yaml +# One-off job for finance/finance-secrets-ensure-5. +# Purpose: finance secrets ensure 5 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: finance-secrets-ensure-5 namespace: finance spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/kustomization.yaml b/services/keycloak/kustomization.yaml index 6030a82..6027891 100644 --- a/services/keycloak/kustomization.yaml +++ b/services/keycloak/kustomization.yaml @@ -10,21 +10,21 @@ resources: - secretproviderclass.yaml - vault-sync-deployment.yaml - deployment.yaml - - realm-settings-job.yaml - - portal-admin-client-secret-ensure-job.yaml - - portal-e2e-client-job.yaml - - portal-e2e-target-client-job.yaml - - portal-e2e-token-exchange-permissions-job.yaml - - portal-e2e-token-exchange-test-job.yaml - - portal-e2e-execute-actions-email-test-job.yaml - - ldap-federation-job.yaml - - user-overrides-job.yaml - - mas-secrets-ensure-job.yaml - - synapse-oidc-secret-ensure-job.yaml - - logs-oidc-secret-ensure-job.yaml - - harbor-oidc-secret-ensure-job.yaml - - vault-oidc-secret-ensure-job.yaml - - actual-oidc-secret-ensure-job.yaml + - oneoffs/realm-settings-job.yaml + - oneoffs/portal-admin-client-secret-ensure-job.yaml + - oneoffs/portal-e2e-client-job.yaml + - oneoffs/portal-e2e-target-client-job.yaml + - oneoffs/portal-e2e-token-exchange-permissions-job.yaml + - oneoffs/portal-e2e-token-exchange-test-job.yaml + - oneoffs/portal-e2e-execute-actions-email-test-job.yaml + - oneoffs/ldap-federation-job.yaml + - oneoffs/user-overrides-job.yaml + - oneoffs/mas-secrets-ensure-job.yaml + - oneoffs/synapse-oidc-secret-ensure-job.yaml + - oneoffs/logs-oidc-secret-ensure-job.yaml + - oneoffs/harbor-oidc-secret-ensure-job.yaml + - oneoffs/vault-oidc-secret-ensure-job.yaml + - oneoffs/actual-oidc-secret-ensure-job.yaml - service.yaml - ingress.yaml generatorOptions: diff --git a/services/keycloak/actual-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/actual-oidc-secret-ensure-job.yaml rename to 
services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml index 3dadb52..d4da1f1 100644 --- a/services/keycloak/actual-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/actual-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml +# One-off job for sso/actual-oidc-secret-ensure-3. +# Purpose: actual oidc secret ensure 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: actual-oidc-secret-ensure-3 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/harbor-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/harbor-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml index 87de463..c368241 100644 --- a/services/keycloak/harbor-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/harbor-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml +# One-off job for sso/harbor-oidc-secret-ensure-10. +# Purpose: harbor oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: harbor-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/ldap-federation-job.yaml b/services/keycloak/oneoffs/ldap-federation-job.yaml similarity index 98% rename from services/keycloak/ldap-federation-job.yaml rename to services/keycloak/oneoffs/ldap-federation-job.yaml index 3c3f1c1..9e9a5f9 100644 --- a/services/keycloak/ldap-federation-job.yaml +++ b/services/keycloak/oneoffs/ldap-federation-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/ldap-federation-job.yaml +# services/keycloak/oneoffs/ldap-federation-job.yaml +# One-off job for sso/keycloak-ldap-federation-12. +# Purpose: keycloak ldap federation 12 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-ldap-federation-12 namespace: sso spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/keycloak/logs-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml similarity index 94% rename from services/keycloak/logs-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml index 14e80df..bce9e5b 100644 --- a/services/keycloak/logs-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/logs-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml +# One-off job for sso/logs-oidc-secret-ensure-10. +# Purpose: logs oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. 
+# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: logs-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/mas-secrets-ensure-job.yaml b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml similarity index 95% rename from services/keycloak/mas-secrets-ensure-job.yaml rename to services/keycloak/oneoffs/mas-secrets-ensure-job.yaml index 24c9e04..c3bd1be 100644 --- a/services/keycloak/mas-secrets-ensure-job.yaml +++ b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml @@ -1,4 +1,8 @@ -# services/keycloak/mas-secrets-ensure-job.yaml +# services/keycloak/oneoffs/mas-secrets-ensure-job.yaml +# One-off job for sso/mas-secrets-ensure. +# Purpose: mas secrets ensure (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: v1 kind: ServiceAccount metadata: @@ -13,6 +17,7 @@ metadata: name: mas-secrets-ensure-21 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/portal-admin-client-secret-ensure-job.yaml b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml similarity index 96% rename from services/keycloak/portal-admin-client-secret-ensure-job.yaml rename to services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml index 90dd4b7..1d3e7f3 100644 --- a/services/keycloak/portal-admin-client-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-admin-client-secret-ensure-job.yaml +# services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml +# One-off job for sso/keycloak-portal-admin-secret-ensure-4. +# Purpose: keycloak portal admin secret ensure 4 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-admin-secret-ensure-4 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-client-job.yaml similarity index 97% rename from services/keycloak/portal-e2e-client-job.yaml rename to services/keycloak/oneoffs/portal-e2e-client-job.yaml index 4e0c006..274dd27 100644 --- a/services/keycloak/portal-e2e-client-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-client-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-client-job.yaml +# services/keycloak/oneoffs/portal-e2e-client-job.yaml +# One-off job for sso/keycloak-portal-e2e-client-8. +# Purpose: keycloak portal e2e client 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-client-8 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml similarity index 89% rename from services/keycloak/portal-e2e-execute-actions-email-test-job.yaml rename to services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml index 35f79a6..518d839 100644 --- a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-execute-actions-email-test-job.yaml +# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml +# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14. +# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-execute-actions-email-14 namespace: sso spec: + suspend: true backoffLimit: 3 template: metadata: diff --git a/services/keycloak/portal-e2e-target-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml similarity index 95% rename from services/keycloak/portal-e2e-target-client-job.yaml rename to services/keycloak/oneoffs/portal-e2e-target-client-job.yaml index 196b48b..900d029 100644 --- a/services/keycloak/portal-e2e-target-client-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-target-client-job.yaml +# services/keycloak/oneoffs/portal-e2e-target-client-job.yaml +# One-off job for sso/keycloak-portal-e2e-target-7. +# Purpose: keycloak portal e2e target 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-target-7 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml similarity index 97% rename from services/keycloak/portal-e2e-token-exchange-permissions-job.yaml rename to services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml index 647b8f9..0d41b47 100644 --- a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-token-exchange-permissions-job.yaml +# services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml +# One-off job for sso/keycloak-portal-e2e-token-exchange-permissions-11. +# Purpose: keycloak portal e2e token exchange permissions 11 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-token-exchange-permissions-11 namespace: sso spec: + suspend: true backoffLimit: 6 template: metadata: diff --git a/services/keycloak/portal-e2e-token-exchange-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml similarity index 89% rename from services/keycloak/portal-e2e-token-exchange-test-job.yaml rename to services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml index edd7555..eb05e09 100644 --- a/services/keycloak/portal-e2e-token-exchange-test-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-token-exchange-test-job.yaml +# services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml +# One-off job for sso/keycloak-portal-e2e-token-exchange-test-7. +# Purpose: keycloak portal e2e token exchange test 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-token-exchange-test-7 namespace: sso spec: + suspend: true backoffLimit: 6 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/oneoffs/realm-settings-job.yaml similarity index 98% rename from services/keycloak/realm-settings-job.yaml rename to services/keycloak/oneoffs/realm-settings-job.yaml index 9265ca3..ea88d83 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/oneoffs/realm-settings-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/realm-settings-job.yaml +# services/keycloak/oneoffs/realm-settings-job.yaml +# One-off job for sso/keycloak-realm-settings-36. +# Purpose: keycloak realm settings 36 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-realm-settings-36 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/synapse-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml similarity index 92% rename from services/keycloak/synapse-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml index e808e7e..15b7a31 100644 --- a/services/keycloak/synapse-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/synapse-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml +# One-off job for sso/synapse-oidc-secret-ensure-10. +# Purpose: synapse oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: synapse-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/user-overrides-job.yaml b/services/keycloak/oneoffs/user-overrides-job.yaml similarity index 96% rename from services/keycloak/user-overrides-job.yaml rename to services/keycloak/oneoffs/user-overrides-job.yaml index 7623c84..0d52d6d 100644 --- a/services/keycloak/user-overrides-job.yaml +++ b/services/keycloak/oneoffs/user-overrides-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/user-overrides-job.yaml +# services/keycloak/oneoffs/user-overrides-job.yaml +# One-off job for sso/keycloak-user-overrides-9. +# Purpose: keycloak user overrides 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-user-overrides-9 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/vault-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/vault-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml index 3aa3ca5..a76c52e 100644 --- a/services/keycloak/vault-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/vault-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml +# One-off job for sso/vault-oidc-secret-ensure-8. +# Purpose: vault oidc secret ensure 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: vault-oidc-secret-ensure-8 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/kustomization.yaml b/services/logging/kustomization.yaml index 08c73a8..dc48715 100644 --- a/services/logging/kustomization.yaml +++ b/services/logging/kustomization.yaml @@ -15,9 +15,9 @@ resources: - opensearch-dashboards-helmrelease.yaml - data-prepper-helmrelease.yaml - otel-collector-helmrelease.yaml - - opensearch-ism-job.yaml - - opensearch-dashboards-setup-job.yaml - - opensearch-observability-setup-job.yaml + - oneoffs/opensearch-ism-job.yaml + - oneoffs/opensearch-dashboards-setup-job.yaml + - oneoffs/opensearch-observability-setup-job.yaml - opensearch-prune-cronjob.yaml - fluent-bit-helmrelease.yaml - node-log-rotation-daemonset.yaml diff --git a/services/logging/opensearch-dashboards-setup-job.yaml b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml similarity index 88% rename from services/logging/opensearch-dashboards-setup-job.yaml rename to services/logging/oneoffs/opensearch-dashboards-setup-job.yaml index 06149d7..1d1a9b6 100644 --- a/services/logging/opensearch-dashboards-setup-job.yaml +++ b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-dashboards-setup-job.yaml +# services/logging/oneoffs/opensearch-dashboards-setup-job.yaml +# One-off job for logging/opensearch-dashboards-setup-4. +# Purpose: opensearch dashboards setup 4 (see container args/env in this file). 
+# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: opensearch-dashboards-setup-4 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-ism-job.yaml b/services/logging/oneoffs/opensearch-ism-job.yaml similarity index 91% rename from services/logging/opensearch-ism-job.yaml rename to services/logging/oneoffs/opensearch-ism-job.yaml index 3313571..476bca7 100644 --- a/services/logging/opensearch-ism-job.yaml +++ b/services/logging/oneoffs/opensearch-ism-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-ism-job.yaml +# services/logging/oneoffs/opensearch-ism-job.yaml +# One-off job for logging/opensearch-ism-setup-5. +# Purpose: opensearch ism setup 5 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: opensearch-ism-setup-5 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-observability-setup-job.yaml b/services/logging/oneoffs/opensearch-observability-setup-job.yaml similarity index 76% rename from services/logging/opensearch-observability-setup-job.yaml rename to services/logging/oneoffs/opensearch-observability-setup-job.yaml index e4590fb..6caa076 100644 --- a/services/logging/opensearch-observability-setup-job.yaml +++ b/services/logging/oneoffs/opensearch-observability-setup-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-observability-setup-job.yaml +# services/logging/oneoffs/opensearch-observability-setup-job.yaml +# One-off job for logging/opensearch-observability-setup-2. +# Purpose: opensearch observability setup 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: opensearch-observability-setup-2 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/mailu/kustomization.yaml b/services/mailu/kustomization.yaml index 7447f24..3e0494e 100644 --- a/services/mailu/kustomization.yaml +++ b/services/mailu/kustomization.yaml @@ -13,7 +13,7 @@ resources: - unbound-configmap.yaml - serverstransport.yaml - ingressroute.yaml - - mailu-sync-job.yaml + - oneoffs/mailu-sync-job.yaml - mailu-sync-cronjob.yaml - front-lb.yaml diff --git a/services/mailu/mailu-sync-job.yaml b/services/mailu/oneoffs/mailu-sync-job.yaml similarity index 93% rename from services/mailu/mailu-sync-job.yaml rename to services/mailu/oneoffs/mailu-sync-job.yaml index 8589e9e..38648ac 100644 --- a/services/mailu/mailu-sync-job.yaml +++ b/services/mailu/oneoffs/mailu-sync-job.yaml @@ -1,10 +1,15 @@ -# services/mailu/mailu-sync-job.yaml +# services/mailu/oneoffs/mailu-sync-job.yaml +# One-off job for mailu-mailserver/mailu-sync-9. +# Purpose: mailu sync 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: mailu-sync-9 namespace: mailu-mailserver spec: + suspend: true template: metadata: annotations: diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index a1ca583..19b2ba9 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -14,10 +14,10 @@ resources: - node-nofile-serviceaccount.yaml - pod-cleaner-rbac.yaml - ariadne-deployment.yaml - - ariadne-migrate-job.yaml + - oneoffs/ariadne-migrate-job.yaml - ariadne-service.yaml - disable-k3s-traefik-daemonset.yaml - - k3s-traefik-cleanup-job.yaml + - oneoffs/k3s-traefik-cleanup-job.yaml - node-nofile-daemonset.yaml - k3s-agent-restart-daemonset.yaml - pod-cleaner-cronjob.yaml diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/oneoffs/ariadne-migrate-job.yaml similarity index 82% rename from services/maintenance/ariadne-migrate-job.yaml rename to services/maintenance/oneoffs/ariadne-migrate-job.yaml index 367a1a0..ecac68d 100644 --- a/services/maintenance/ariadne-migrate-job.yaml +++ b/services/maintenance/oneoffs/ariadne-migrate-job.yaml @@ -1,4 +1,8 @@ -# services/maintenance/ariadne-migrate-job.yaml +# services/maintenance/oneoffs/ariadne-migrate-job.yaml +# One-off job for maintenance/ariadne-migrate-2. +# Purpose: ariadne migrate 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: diff --git a/services/maintenance/k3s-traefik-cleanup-job.yaml b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml similarity index 77% rename from services/maintenance/k3s-traefik-cleanup-job.yaml rename to services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml index d5d12a6..2c365a9 100644 --- a/services/maintenance/k3s-traefik-cleanup-job.yaml +++ b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml @@ -1,10 +1,15 @@ -# services/maintenance/k3s-traefik-cleanup-job.yaml +# services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml +# One-off job for maintenance/k3s-traefik-cleanup-2. +# Purpose: k3s traefik cleanup 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: k3s-traefik-cleanup-2 namespace: maintenance spec: + suspend: true backoffLimit: 1 template: spec: diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 5953039..23c1595 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -23,8 +23,8 @@ resources: - grafana-alerting-config.yaml - grafana-folders.yaml - helmrelease.yaml - - grafana-org-bootstrap.yaml - - grafana-user-dedupe-job.yaml + - oneoffs/grafana-org-bootstrap.yaml + - oneoffs/grafana-user-dedupe-job.yaml configMapGenerator: - name: postmark-exporter-script diff --git a/services/monitoring/grafana-org-bootstrap.yaml b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml similarity index 93% rename from services/monitoring/grafana-org-bootstrap.yaml rename to services/monitoring/oneoffs/grafana-org-bootstrap.yaml index f1d4075..6f824cc 100644 --- a/services/monitoring/grafana-org-bootstrap.yaml +++ b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml @@ -1,10 +1,15 @@ -# services/monitoring/grafana-org-bootstrap.yaml +# services/monitoring/oneoffs/grafana-org-bootstrap.yaml +# One-off job for monitoring/grafana-org-bootstrap-3. +# Purpose: grafana org bootstrap 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: grafana-org-bootstrap-3 namespace: monitoring spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml similarity index 94% rename from services/monitoring/grafana-user-dedupe-job.yaml rename to services/monitoring/oneoffs/grafana-user-dedupe-job.yaml index 8ab1a66..8194f18 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml @@ -1,10 +1,15 @@ -# services/monitoring/grafana-user-dedupe-job.yaml +# services/monitoring/oneoffs/grafana-user-dedupe-job.yaml +# One-off job for monitoring/grafana-user-dedupe-api-v7. +# Purpose: grafana user dedupe api v7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: grafana-user-dedupe-api-v7 namespace: monitoring spec: + suspend: true backoffLimit: 1 template: metadata: From c1e94d56c857ff127650ddba67edd386b9f49400 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:52:23 -0300 Subject: [PATCH 395/416] atlasbot: simplify fast path --- services/comms/scripts/atlasbot/bot.py | 32 ++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 50fed4b..d0d46ef 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3605,10 +3605,33 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 4 + return 2 return 7 +def _open_ended_fast_single( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None = None, + model: str, +) -> str: + if state: + state.update("drafting", step=2, note="summarizing") + context = fact_pack + reply = _ollama_call( + ("atlasbot_fast", "atlasbot_fast"), + prompt, + context=context, + use_history=False, + model=model, + ) + if state: + state.update("done", step=_open_ended_total_steps("fast")) + return _ensure_scores(reply) + + def _open_ended_fast( prompt: str, *, @@ -3618,14 +3641,13 @@ def _open_ended_fast( history_lines: list[str], state: ThoughtState | None = None, ) -> str: - return _open_ended_multi( + model = _model_for_mode("fast") + return _open_ended_fast_single( prompt, fact_pack=fact_pack, - fact_lines=fact_lines, - fact_meta=fact_meta, history_lines=history_lines, - mode="fast", state=state, + model=model, ) From 2a2179a1384d0e2484ca53f8c3ba9768df40de3e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:52:40 -0300 Subject: [PATCH 396/416] comms: roll atlasbot config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 1d89335..8607858 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-87 + checksum/atlasbot-configmap: manual-atlasbot-88 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 62e0a565f54215f9c69b41b118008cb2f44e4dbe Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:58:07 -0300 Subject: [PATCH 397/416] atlasbot: tighten fast facts --- services/comms/scripts/atlasbot/bot.py | 39 +++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index d0d46ef..c44c7da 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3605,10 +3605,25 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 2 + return 3 return 7 +def _fast_fact_lines( + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + fact_ids: list[str], +) -> list[str]: + if not fact_ids: + return fact_lines + selected = [ + line + for line in fact_lines + if fact_meta.get(line, {}).get("id") in set(fact_ids) + ] + return selected or fact_lines + + def _open_ended_fast_single( prompt: str, *, @@ -3642,6 
+3657,27 @@ def _open_ended_fast( state: ThoughtState | None = None, ) -> str: model = _model_for_mode("fast") + if state: + state.update("selecting", step=2, note="picking key facts") + subjective = _is_subjective_query(prompt) + focus_tags = _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + primary_ids = _open_ended_select_facts( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + focus_tags=focus_tags, + avoid_fact_ids=[], + count=3, + subjective=subjective, + state=state, + step=2, + model=model, + ) + selected_lines = _fast_fact_lines(fact_lines, fact_meta, primary_ids) + fact_pack = _fact_pack_text(selected_lines, fact_meta) return _open_ended_fast_single( prompt, fact_pack=fact_pack, @@ -4092,6 +4128,7 @@ def _ollama_call( "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " "Translate metrics into natural language instead of echoing raw label/value pairs. " + "When providing counts or totals, use the exact numbers from the context; do not invent or truncate. " "Avoid bare lists unless the user asked for a list; weave numbers into sentences. " "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. " "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. " From 769d3f41bf23972e155942f1c0e32b28afa57821 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:58:23 -0300 Subject: [PATCH 398/416] comms: roll atlasbot config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 8607858..a7fbea9 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-88 + checksum/atlasbot-configmap: manual-atlasbot-89 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 666dcb3faa09243eb010c8aac081f4a1ea007d6a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 02:21:42 -0300 Subject: [PATCH 399/416] atlasbot: rework reasoning pipeline --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 286 +++++++++++++++++------- 2 files changed, 210 insertions(+), 78 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index a7fbea9..c9602c3 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-89 + checksum/atlasbot-configmap: manual-atlasbot-90 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index c44c7da..ffc8a5c 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -16,7 +16,7 @@ PASSWORD = os.environ["BOT_PASS"] ROOM_ALIAS = 
"#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") -MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") +MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct") MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "") MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "") FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") @@ -2895,6 +2895,7 @@ def _open_ended_system() -> str: "Use ONLY the provided fact pack and recent chat as your evidence. " "You may draw light inferences if you label them as such. " "Write concise, human sentences with a helpful, calm tone (not a list). " + "Be willing to take a light stance; do not over-hedge. " "If the question is subjective (cool/interesting/unconventional), pick a standout fact and explain why it stands out. " "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " @@ -3002,20 +3003,27 @@ def _open_ended_plan( *, fact_pack: str, history_lines: list[str], + focus_tags: set[str], + avoid_tags: set[str], count: int, state: ThoughtState | None, + step: int, model: str | None, ) -> list[dict[str, Any]]: if state: - state.update("planning", step=1, note="mapping angles") + state.update("planning", step=step, note="mapping angles") count = max(1, count) + focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any" + avoid_hint = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" prompt_text = ( "Analyze the question and propose up to " f"{count} distinct answer angles that can be supported by the fact pack. " "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). " "If the question is subjective, propose at least one angle that surfaces a standout detail. " + f"Prefer angles that align with these tags: {focus_hint}. " + f"Avoid angles that overlap these tags if possible: {avoid_hint}. " "Avoid repeating the same angle as the most recent response if possible. " - "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"priority\":1-5}]}." + "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"tags\":[\"tag\"],\"priority\":1-5}]}." 
) context = _append_history_context(fact_pack, history_lines) result = _ollama_json_call( @@ -3037,10 +3045,12 @@ def _open_ended_plan( priority = item.get("priority") if not isinstance(priority, (int, float)): priority = 3 + tags = _sanitize_focus_tags(item.get("tags") or []) cleaned.append( { "focus": focus, "reason": str(item.get("reason") or ""), + "tags": tags, "priority": int(max(1, min(5, priority))), } ) @@ -3131,6 +3141,35 @@ def _preferred_tags_for_prompt(prompt: str) -> set[str]: return tags & _ALLOWED_INSIGHT_TAGS +_TAG_KEYWORDS: dict[str, tuple[str, ...]] = { + "utilization": ("cpu", "ram", "memory", "net", "network", "io", "disk", "usage", "utilization", "hottest", "busy"), + "database": ("postgres", "db", "database", "connections"), + "pods": ("pod", "pods", "deployment", "daemonset", "job", "cron", "workload"), + "hardware": ("hardware", "architecture", "arch", "rpi", "raspberry", "jetson", "amd64", "arm64", "node", "nodes"), + "availability": ("ready", "not ready", "unready", "down", "missing"), + "workloads": ("workload", "service", "namespace", "app"), + "os": ("os", "kernel", "kubelet", "containerd", "runtime"), +} + + +def _tags_from_text(text: str) -> set[str]: + q = normalize_query(text) + if not q: + return set() + tags: set[str] = set() + for tag, keywords in _TAG_KEYWORDS.items(): + if any(word in q for word in keywords): + tags.add(tag) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _history_focus_tags(history_lines: list[str]) -> set[str]: + if not history_lines: + return set() + recent = " ".join(line for line in history_lines[-6:] if isinstance(line, str)) + return _tags_from_text(recent) + + def _open_ended_insights( prompt: str, *, @@ -3139,10 +3178,11 @@ def _open_ended_insights( history_lines: list[str], count: int, state: ThoughtState | None, + step: int, model: str | None, ) -> list[dict[str, Any]]: if state: - state.update("analyzing", note="scouting insights") + state.update("analyzing", step=step, note="scouting insights") count = max(1, count) allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) prompt_text = ( @@ -3188,10 +3228,35 @@ def _open_ended_insights( return cleaned +def _rank_insights( + insights: list[dict[str, Any]], + *, + focus_tags: set[str], + avoid_tags: set[str], + count: int, +) -> list[dict[str, Any]]: + if not insights: + return [] + ranked: list[tuple[float, dict[str, Any]]] = [] + for insight in insights: + relevance = _normalize_fraction(insight.get("relevance"), default=0.6) + novelty = _normalize_fraction(insight.get("novelty"), default=0.5) + tags = set(insight.get("tags") or []) + score = relevance * 0.65 + novelty * 0.35 + if focus_tags and tags & focus_tags: + score += 0.1 + if avoid_tags and tags & avoid_tags: + score -= 0.2 + ranked.append((score, insight)) + ranked.sort(key=lambda item: item[0], reverse=True) + return [item for _, item in ranked[:count]] + + def _fallback_fact_ids( fact_meta: dict[str, dict[str, Any]], *, focus_tags: set[str], + avoid_tags: set[str], count: int, ) -> list[str]: if not fact_meta: @@ -3202,9 +3267,16 @@ def _fallback_fact_ids( for fid, meta in fact_meta.items() if focus_tags & set(meta.get("tags") or []) ] + if avoid_tags: + tagged = [fid for fid in tagged if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or []))] if tagged: return tagged[:count] - return list(fact_meta.keys())[:count] + all_ids = list(fact_meta.keys()) + if avoid_tags: + filtered = [fid for fid in all_ids if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or []))] + if filtered: + return 
filtered[:count] + return all_ids[:count] def _open_ended_select_facts( @@ -3214,6 +3286,7 @@ def _open_ended_select_facts( fact_meta: dict[str, dict[str, Any]], history_lines: list[str], focus_tags: set[str], + avoid_tags: set[str], avoid_fact_ids: list[str], count: int, subjective: bool, @@ -3224,11 +3297,13 @@ def _open_ended_select_facts( if state: state.update("selecting facts", step=step, note="picking evidence") focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any" + avoid_tag_hint = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" avoid_hint = ", ".join(avoid_fact_ids) if avoid_fact_ids else "none" prompt_text = ( "Select the fact IDs that best answer the question. " f"Pick up to {count} fact IDs. " f"Focus tags: {focus_hint}. " + f"Avoid these tags if possible: {avoid_tag_hint}. " f"Avoid these fact IDs: {avoid_hint}. " "If the question is subjective, pick standout or unusual facts; " "if objective, pick the minimal facts needed. " @@ -3248,7 +3323,18 @@ def _open_ended_select_facts( selected.append(fid) if len(selected) >= count: break - seed = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + if avoid_tags: + selected = [ + fid + for fid in selected + if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or [])) + ] or selected + seed = _fallback_fact_ids( + fact_meta, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=count, + ) if selected: for fid in seed: if fid not in selected: @@ -3483,7 +3569,7 @@ def _open_ended_multi( if mode == "fast": total_steps = 4 else: - total_steps = 7 + total_steps = 9 if state: state.total_steps = total_steps @@ -3503,41 +3589,25 @@ def _open_ended_multi( focus_tags = set(interpretation.get("focus_tags") or []) or _preferred_tags_for_prompt(prompt) if not focus_tags and subjective: focus_tags = set(_ALLOWED_INSIGHT_TAGS) + avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() - primary_ids = _open_ended_select_facts( - prompt, - fact_pack=fact_pack, - fact_meta=fact_meta, - history_lines=history_lines, - focus_tags=focus_tags, - avoid_fact_ids=[], - count=4 if mode == "deep" else 3, - subjective=subjective, - state=state, - step=2, - model=model, - ) - alternate_ids: list[str] = [] - if mode == "deep": - alternate_ids = _open_ended_select_facts( + if mode == "fast": + primary_ids = _open_ended_select_facts( prompt, fact_pack=fact_pack, fact_meta=fact_meta, history_lines=history_lines, focus_tags=focus_tags, - avoid_fact_ids=primary_ids, - count=4, + avoid_tags=avoid_tags, + avoid_fact_ids=[], + count=3, subjective=subjective, state=state, - step=3, + step=2, model=model, ) - - candidates: list[dict[str, Any]] = [] - focus_label = interpretation.get("focus_label") or "primary angle" - step = 3 if mode == "fast" else 4 - candidates.append( - _open_ended_candidate( + focus_label = interpretation.get("focus_label") or "primary angle" + candidate = _open_ended_candidate( prompt, focus=str(focus_label), fact_pack=fact_pack, @@ -3546,17 +3616,65 @@ def _open_ended_multi( tone=str(tone), allow_list=allow_list, state=state, - step=step, + step=3, fact_hints=primary_ids, model=model, ) + reply = _open_ended_synthesize( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=[candidate], + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=4, + model=model, + critique=None, + ) + if state: + state.update("done", step=total_steps) + return reply + + angles = _open_ended_plan( + prompt, + 
fact_pack=fact_pack, + history_lines=history_lines, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=5, + state=state, + step=2, + model=model, ) - step += 1 - if mode == "deep" and alternate_ids: + if state and avoid_tags: + state.update("planning", step=2, note=f"avoiding {', '.join(sorted(avoid_tags))}") + + insights = _open_ended_insights( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + count=7, + state=state, + step=3, + model=model, + ) + ranked_insights = _rank_insights( + insights, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=3, + ) + + candidates: list[dict[str, Any]] = [] + step = 4 + for insight in ranked_insights: candidates.append( _open_ended_candidate( prompt, - focus="alternate angle", + focus=insight.get("summary") or "insight", fact_pack=fact_pack, history_lines=history_lines, subjective=subjective, @@ -3564,27 +3682,61 @@ def _open_ended_multi( allow_list=allow_list, state=state, step=step, - fact_hints=alternate_ids, + fact_hints=insight.get("fact_ids") or [], model=model, ) ) step += 1 + if not candidates and angles: + for angle in angles[:2]: + angle_tags = set(angle.get("tags") or []) or _tags_from_text(angle.get("focus") or "") + fact_ids = _open_ended_select_facts( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + focus_tags=angle_tags or focus_tags, + avoid_tags=avoid_tags, + avoid_fact_ids=[], + count=4, + subjective=subjective, + state=state, + step=step, + model=model, + ) + candidates.append( + _open_ended_candidate( + prompt, + focus=angle.get("focus") or "alternate angle", + fact_pack=fact_pack, + history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + fact_hints=fact_ids, + model=model, + ) + ) + step += 1 + if len(candidates) >= 2: + break + if state: state.update("evaluating", step=step, note="ranking candidates") - selected = _select_candidates(candidates, count=1 if mode == "fast" else 2) + selected = _select_candidates(candidates, count=2) + step += 1 + critique = _open_ended_critique( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + state=state, + step=step, + model=model, + ) step += 1 - critique = "" - if mode == "deep": - critique = _open_ended_critique( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - candidates=selected or candidates, - state=state, - step=step, - model=model, - ) - step += 1 reply = _open_ended_synthesize( prompt, fact_pack=fact_pack, @@ -3605,8 +3757,8 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 3 - return 7 + return 4 + return 9 def _fast_fact_lines( @@ -3656,34 +3808,14 @@ def _open_ended_fast( history_lines: list[str], state: ThoughtState | None = None, ) -> str: - model = _model_for_mode("fast") - if state: - state.update("selecting", step=2, note="picking key facts") - subjective = _is_subjective_query(prompt) - focus_tags = _preferred_tags_for_prompt(prompt) - if not focus_tags and subjective: - focus_tags = set(_ALLOWED_INSIGHT_TAGS) - primary_ids = _open_ended_select_facts( + return _open_ended_multi( prompt, fact_pack=fact_pack, + fact_lines=fact_lines, fact_meta=fact_meta, history_lines=history_lines, - focus_tags=focus_tags, - avoid_fact_ids=[], - count=3, - subjective=subjective, + mode="fast", state=state, - step=2, - model=model, - ) - selected_lines = _fast_fact_lines(fact_lines, fact_meta, primary_ids) - fact_pack = 
_fact_pack_text(selected_lines, fact_meta) - return _open_ended_fast_single( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - state=state, - model=model, ) @@ -3846,7 +3978,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): if cluster_query: context = build_context( cleaned, - allow_tools=False, + allow_tools=True, targets=[], inventory=inventory, snapshot=snapshot, @@ -3860,7 +3992,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): workloads=workloads, history_lines=history_lines, mode=mode, - allow_tools=False, + allow_tools=True, state=None, ) else: From 349a46ceab3a4b9396883eadb015824a8c4d1c49 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 02:43:24 -0300 Subject: [PATCH 400/416] comms: tune atlasbot quick model --- services/comms/atlasbot-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c9602c3..d570fd9 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-90 + checksum/atlasbot-configmap: manual-atlasbot-91 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -84,7 +84,7 @@ spec: - name: OLLAMA_MODEL value: qwen2.5:14b-instruct - name: ATLASBOT_MODEL_FAST - value: qwen2.5:14b-instruct + value: qwen2.5:7b-instruct-q4_0 - name: ATLASBOT_MODEL_DEEP value: qwen2.5:14b-instruct - name: OLLAMA_FALLBACK_MODEL From 08ac598181aa636d8789d7a9758852173b62c579 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 02:53:43 -0300 Subject: [PATCH 401/416] atlasbot: streamline quick answers --- services/comms/scripts/atlasbot/bot.py | 120 ++++++++++--------------- 1 file changed, 47 insertions(+), 73 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index ffc8a5c..6f18b9e 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3562,14 +3562,10 @@ def _open_ended_multi( fact_lines: list[str], fact_meta: dict[str, dict[str, Any]], history_lines: list[str], - mode: str, state: ThoughtState | None = None, ) -> str: - model = _model_for_mode(mode) - if mode == "fast": - total_steps = 4 - else: - total_steps = 9 + model = _model_for_mode("deep") + total_steps = _open_ended_total_steps("deep") if state: state.total_steps = total_steps @@ -3591,52 +3587,6 @@ def _open_ended_multi( focus_tags = set(_ALLOWED_INSIGHT_TAGS) avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() - if mode == "fast": - primary_ids = _open_ended_select_facts( - prompt, - fact_pack=fact_pack, - fact_meta=fact_meta, - history_lines=history_lines, - focus_tags=focus_tags, - avoid_tags=avoid_tags, - avoid_fact_ids=[], - count=3, - subjective=subjective, - state=state, - step=2, - model=model, - ) - focus_label = interpretation.get("focus_label") or "primary angle" - candidate = _open_ended_candidate( - prompt, - focus=str(focus_label), - fact_pack=fact_pack, - history_lines=history_lines, - subjective=subjective, - tone=str(tone), - allow_list=allow_list, - state=state, - step=3, - fact_hints=primary_ids, - model=model, - ) - reply = _open_ended_synthesize( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - candidates=[candidate], - 
subjective=subjective, - tone=str(tone), - allow_list=allow_list, - state=state, - step=4, - model=model, - critique=None, - ) - if state: - state.update("done", step=total_steps) - return reply - angles = _open_ended_plan( prompt, fact_pack=fact_pack, @@ -3757,41 +3707,52 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 4 + return 2 return 9 def _fast_fact_lines( fact_lines: list[str], fact_meta: dict[str, dict[str, Any]], - fact_ids: list[str], + *, + focus_tags: set[str], + avoid_tags: set[str], + limit: int = 10, ) -> list[str]: - if not fact_ids: - return fact_lines - selected = [ - line - for line in fact_lines - if fact_meta.get(line, {}).get("id") in set(fact_ids) - ] - return selected or fact_lines + if not fact_lines: + return [] + selected: list[str] = [] + for idx, line in enumerate(fact_lines): + fid = f"F{idx + 1}" + tags = set(fact_meta.get(fid, {}).get("tags") or []) + if focus_tags and not (focus_tags & tags): + continue + if avoid_tags and (avoid_tags & tags): + continue + selected.append(line) + if len(selected) >= limit: + break + if selected: + return selected + trimmed = fact_lines[:limit] + return trimmed or fact_lines def _open_ended_fast_single( prompt: str, *, - fact_pack: str, - history_lines: list[str], + context: str, state: ThoughtState | None = None, model: str, ) -> str: if state: - state.update("drafting", step=2, note="summarizing") - context = fact_pack + state.update("drafting", step=1, note="summarizing") reply = _ollama_call( ("atlasbot_fast", "atlasbot_fast"), prompt, context=context, use_history=False, + system_override=_open_ended_system(), model=model, ) if state: @@ -3808,14 +3769,28 @@ def _open_ended_fast( history_lines: list[str], state: ThoughtState | None = None, ) -> str: - return _open_ended_multi( + model = _model_for_mode("fast") + subjective = _is_subjective_query(prompt) + focus_tags = _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() + selected_lines = _fast_fact_lines( + fact_lines, + fact_meta, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + ) + selected_meta = _fact_pack_meta(selected_lines) + selected_pack = _fact_pack_text(selected_lines, selected_meta) + context = _append_history_context(selected_pack, history_lines) + if state: + state.total_steps = _open_ended_total_steps("fast") + return _open_ended_fast_single( prompt, - fact_pack=fact_pack, - fact_lines=fact_lines, - fact_meta=fact_meta, - history_lines=history_lines, - mode="fast", + context=context, state=state, + model=model, ) @@ -3834,7 +3809,6 @@ def _open_ended_deep( fact_lines=fact_lines, fact_meta=fact_meta, history_lines=history_lines, - mode="deep", state=state, ) From 980c2cf1cc6721619299ac0d3e53ccf061fecd9b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:09:34 -0300 Subject: [PATCH 402/416] atlasbot: enrich fact pack summaries --- services/comms/scripts/atlasbot/bot.py | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6f18b9e..96765b1 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1037,6 +1037,11 @@ def facts_context( nodes_list = by_hardware.get(key) or [] if nodes_list: lines.append(f"- {key}: {', '.join(nodes_list)}") + if by_hardware: + counts = {key: 
len(nodes_list) for key, nodes_list in by_hardware.items() if nodes_list} + if counts: + parts = [f"{key}={count}" for key, count in sorted(counts.items())] + lines.append(f"- nodes_by_hardware_count: {', '.join(parts)}") non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", []))) if non_rpi: lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}") @@ -1096,6 +1101,25 @@ def facts_context( value = metrics.get(key) if value is not None: lines.append(f"- {key}: {value}") + if workloads: + ns_counts: dict[str, int] = collections.defaultdict(int) + for entry in workloads: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + pods = entry.get("pods_running") + if pods is None: + pods = entry.get("pods_total") + try: + pods_val = int(pods) + except (TypeError, ValueError): + pods_val = 0 + if ns: + ns_counts[ns] += pods_val + if ns_counts: + top_ns = sorted(ns_counts.items(), key=lambda item: item[1], reverse=True)[:5] + parts = [f"{ns}={count}" for ns, count in top_ns] + lines.append(f"- pods_by_namespace: {', '.join(parts)}") top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else [] if top_restarts: @@ -2725,6 +2749,8 @@ def _fact_line_tags(line: str) -> set[str]: tags.add("database") if "pods_" in text or "pod phases" in text or "restarts" in text: tags.add("pods") + if "namespace" in text: + tags.add("workloads") if "workloads" in text or "primary_node" in text or "workload_" in text: tags.add("workloads") if "node_details" in text: @@ -2900,6 +2926,8 @@ def _open_ended_system() -> str: "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " + "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " + "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " "Do not invent numbers or facts. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." 
) From 971848558aa32f837ff16f2cb91ce41ea338079c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:14:12 -0300 Subject: [PATCH 403/416] atlasbot: prioritize fact selection for quick answers --- services/comms/scripts/atlasbot/bot.py | 56 +++++++++++++++++++++----- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 96765b1..43f578b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3169,6 +3169,23 @@ def _preferred_tags_for_prompt(prompt: str) -> set[str]: return tags & _ALLOWED_INSIGHT_TAGS +def _primary_tags_for_prompt(prompt: str) -> set[str]: + q = normalize_query(prompt) + if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")): + return {"utilization"} + if any(word in q for word in ("postgres", "database", "db", "connections")): + return {"database"} + if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")): + return {"pods"} + if any(word in q for word in ("workload", "service", "namespace")): + return {"workloads"} + if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")): + return {"availability"} + if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")): + return {"hardware", "inventory", "architecture"} + return set() + + _TAG_KEYWORDS: dict[str, tuple[str, ...]] = { "utilization": ("cpu", "ram", "memory", "net", "network", "io", "disk", "usage", "utilization", "hottest", "busy"), "database": ("postgres", "db", "database", "connections"), @@ -3745,25 +3762,43 @@ def _fast_fact_lines( *, focus_tags: set[str], avoid_tags: set[str], + primary_tags: set[str] | None = None, limit: int = 10, ) -> list[str]: if not fact_lines: return [] - selected: list[str] = [] + primary_tags = primary_tags or set() + scored: list[tuple[int, int, str]] = [] for idx, line in enumerate(fact_lines): fid = f"F{idx + 1}" tags = set(fact_meta.get(fid, {}).get("tags") or []) - if focus_tags and not (focus_tags & tags): - continue if avoid_tags and (avoid_tags & tags): continue - selected.append(line) + score = 0 + if primary_tags: + score += 4 * len(tags & primary_tags) + if focus_tags: + score += 2 * len(tags & focus_tags) + scored.append((score, idx, line)) + scored.sort(key=lambda item: (-item[0], item[1])) + selected: list[str] = [] + for score, _, line in scored: + if score <= 0 and selected: + break + if score > 0: + selected.append(line) if len(selected) >= limit: break - if selected: - return selected - trimmed = fact_lines[:limit] - return trimmed or fact_lines + if not selected: + selected = [line for _, _, line in scored[:limit]] + elif len(selected) < limit: + for _, _, line in scored: + if line in selected: + continue + selected.append(line) + if len(selected) >= limit: + break + return selected def _open_ended_fast_single( @@ -3799,6 +3834,7 @@ def _open_ended_fast( ) -> str: model = _model_for_mode("fast") subjective = _is_subjective_query(prompt) + primary_tags = _primary_tags_for_prompt(prompt) focus_tags = _preferred_tags_for_prompt(prompt) if not focus_tags and subjective: focus_tags = set(_ALLOWED_INSIGHT_TAGS) @@ -3808,15 +3844,15 @@ def _open_ended_fast( fact_meta, focus_tags=focus_tags, avoid_tags=avoid_tags, + primary_tags=primary_tags, ) selected_meta = _fact_pack_meta(selected_lines) 
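    # Re-derive metadata from the filtered lines so the re-rendered pack only
    # carries the selected facts (and, by the look of the surrounding helpers,
    # freshly numbered fact IDs); a reading of the code, not a verified invariant.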
selected_pack = _fact_pack_text(selected_lines, selected_meta) - context = _append_history_context(selected_pack, history_lines) if state: state.total_steps = _open_ended_total_steps("fast") return _open_ended_fast_single( prompt, - context=context, + context=selected_pack, state=state, model=model, ) From be82109d4e43af3cd80e3137fe623e29d3b91d1f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:17:46 -0300 Subject: [PATCH 404/416] atlasbot: enforce fast answer body --- services/comms/scripts/atlasbot/bot.py | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 43f578b..7d47423 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2926,6 +2926,7 @@ def _open_ended_system() -> str: "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " + "Always include at least one substantive answer sentence before the score lines. " "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " "Do not invent numbers or facts. " @@ -3801,6 +3802,24 @@ def _fast_fact_lines( return selected +def _has_body_lines(answer: str) -> bool: + lines = [line.strip() for line in (answer or "").splitlines() if line.strip()] + for line in lines: + lowered = line.lower() + if lowered.startswith("confidence"): + continue + if lowered.startswith("relevance"): + continue + if lowered.startswith("satisfaction"): + continue + if lowered.startswith("hallucinationrisk"): + continue + if lowered.startswith("hallucination risk"): + continue + return True + return False + + def _open_ended_fast_single( prompt: str, *, @@ -3818,6 +3837,15 @@ def _open_ended_fast_single( system_override=_open_ended_system(), model=model, ) + if not _has_body_lines(reply): + reply = _ollama_call( + ("atlasbot_fast", "atlasbot_fast"), + prompt + " Provide one clear sentence before the score lines.", + context=context, + use_history=False, + system_override=_open_ended_system(), + model=model, + ) if state: state.update("done", step=_open_ended_total_steps("fast")) return _ensure_scores(reply) From 8316e5dd15f23b8c99a18969364aff2034291072 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:20:28 -0300 Subject: [PATCH 405/416] atlasbot: fix tag detection for workload queries --- services/comms/scripts/atlasbot/bot.py | 29 ++++++++++++++------------ 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7d47423..b73d3f3 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3154,35 +3154,37 @@ def _open_ended_interpret( def _preferred_tags_for_prompt(prompt: str) -> set[str]: q = normalize_query(prompt) + tokens = set(_tokens(prompt)) tags: set[str] = set() - if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")): + if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}: 
tags.add("utilization") - if any(word in q for word in ("postgres", "database", "db", "connections")): + if tokens & {"postgres", "database", "db", "connections"}: tags.add("database") - if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")): + if tokens & {"pod", "pods", "deployment", "job", "cronjob"}: tags.add("pods") - if any(word in q for word in ("workload", "service", "namespace")): + if tokens & {"workload", "service", "namespace"}: tags.add("workloads") - if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")): + if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q: tags.add("availability") - if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")): + if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}: tags.update({"hardware", "inventory", "architecture"}) return tags & _ALLOWED_INSIGHT_TAGS def _primary_tags_for_prompt(prompt: str) -> set[str]: q = normalize_query(prompt) - if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")): + tokens = set(_tokens(prompt)) + if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}: return {"utilization"} - if any(word in q for word in ("postgres", "database", "db", "connections")): + if tokens & {"postgres", "database", "db", "connections"}: return {"database"} - if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")): + if tokens & {"pod", "pods", "deployment", "job", "cronjob"}: return {"pods"} - if any(word in q for word in ("workload", "service", "namespace")): + if tokens & {"workload", "service", "namespace"}: return {"workloads"} - if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")): + if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q: return {"availability"} - if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")): + if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}: return {"hardware", "inventory", "architecture"} return set() @@ -3202,9 +3204,10 @@ def _tags_from_text(text: str) -> set[str]: q = normalize_query(text) if not q: return set() + tokens = set(_tokens(text)) tags: set[str] = set() for tag, keywords in _TAG_KEYWORDS.items(): - if any(word in q for word in keywords): + if any(word in tokens for word in keywords): tags.add(tag) return tags & _ALLOWED_INSIGHT_TAGS From 885e7b648957a79027afae52dbfde2c1ebb88394 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:23:54 -0300 Subject: [PATCH 406/416] comms: use 14b model for atlasbot quick --- services/comms/atlasbot-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d570fd9..6fbd327 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-91 + checksum/atlasbot-configmap: manual-atlasbot-92 vault.hashicorp.com/agent-inject: "true" 
vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -84,7 +84,7 @@ spec: - name: OLLAMA_MODEL value: qwen2.5:14b-instruct - name: ATLASBOT_MODEL_FAST - value: qwen2.5:7b-instruct-q4_0 + value: qwen2.5:14b-instruct-q4_0 - name: ATLASBOT_MODEL_DEEP value: qwen2.5:14b-instruct - name: OLLAMA_FALLBACK_MODEL From 19b52ac5e3efcf49d8933348b6877b3d80088da1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:29:21 -0300 Subject: [PATCH 407/416] atlasbot: add fact-pack fallback for fast --- services/comms/scripts/atlasbot/bot.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index b73d3f3..4fa67d4 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3823,6 +3823,37 @@ def _has_body_lines(answer: str) -> bool: return False +def _fallback_fact_answer(prompt: str, context: str) -> str: + facts: list[str] = [] + for line in (context or "").splitlines(): + trimmed = line.strip() + if not trimmed.startswith("F"): + continue + if ":" not in trimmed: + continue + fact = trimmed.split(":", 1)[1].strip() + if fact.startswith("-"): + fact = fact.lstrip("-").strip() + if fact: + facts.append(fact) + if not facts: + return "" + tokens = set(_tokens(prompt)) + best_fact = "" + best_score = -1 + for fact in facts: + score = len(tokens & set(_tokens(fact))) + if score > best_score: + best_score = score + best_fact = fact + if best_score <= 0: + return "" + sentence = f"Based on the snapshot, {best_fact}" + if not sentence.endswith((".", "!", "?")): + sentence += "." + return sentence + + def _open_ended_fast_single( prompt: str, *, @@ -3849,6 +3880,10 @@ def _open_ended_fast_single( system_override=_open_ended_system(), model=model, ) + if not _has_body_lines(reply): + fallback = _fallback_fact_answer(prompt, context) + if fallback: + reply = fallback if state: state.update("done", step=_open_ended_total_steps("fast")) return _ensure_scores(reply) From a9d74a066fb26946f2172103a8ca274de076d316 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:32:17 -0300 Subject: [PATCH 408/416] atlasbot: prefer fact fallback for quantitative prompts --- services/comms/scripts/atlasbot/bot.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 4fa67d4..8806d2a 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3854,6 +3854,18 @@ def _fallback_fact_answer(prompt: str, context: str) -> str: return sentence +def _is_quantitative_prompt(prompt: str) -> bool: + q = normalize_query(prompt) + if not q: + return False + tokens = set(_tokens(prompt)) + if "how many" in q or "count" in tokens or "total" in tokens: + return True + if tokens & {"highest", "lowest", "hottest", "most", "least"}: + return True + return False + + def _open_ended_fast_single( prompt: str, *, @@ -3880,10 +3892,9 @@ def _open_ended_fast_single( system_override=_open_ended_system(), model=model, ) - if not _has_body_lines(reply): - fallback = _fallback_fact_answer(prompt, context) - if fallback: - reply = fallback + fallback = _fallback_fact_answer(prompt, context) + if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)): + reply = fallback if state: state.update("done", step=_open_ended_total_steps("fast")) 
return _ensure_scores(reply) From eb567fda0654c51ade2ac94fd6e8ac634f2e339e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:35:02 -0300 Subject: [PATCH 409/416] atlasbot: fix fallback fact parsing --- services/comms/scripts/atlasbot/bot.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 8806d2a..e0f8417 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3829,9 +3829,12 @@ def _fallback_fact_answer(prompt: str, context: str) -> str: trimmed = line.strip() if not trimmed.startswith("F"): continue - if ":" not in trimmed: + match = re.match(r"^F\\d+.*?\\]:\\s*(.*)$", trimmed) + if not match: + match = re.match(r"^F\\d+:\\s*(.*)$", trimmed) + if not match: continue - fact = trimmed.split(":", 1)[1].strip() + fact = match.group(1).strip() if fact.startswith("-"): fact = fact.lstrip("-").strip() if fact: From 7194cad0a8fdb040ae5ca34a7c65c785f84f2f10 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:46:06 -0300 Subject: [PATCH 410/416] atlasbot: refine fast fact selection and prompts --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 56 +++++++++++++++++++++---- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 6fbd327..f007942 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-92 + checksum/atlasbot-configmap: manual-atlasbot-93 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e0f8417..5ce1984 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -253,11 +253,13 @@ def normalize_query(text: str) -> str: cleaned = (text or "").lower() for ch in _DASH_CHARS: cleaned = cleaned.replace(ch, "-") + cleaned = cleaned.replace("_", " ") cleaned = re.sub(r"\s+", " ", cleaned).strip() return cleaned def _tokens(text: str) -> list[str]: - toks = [t.lower() for t in TOKEN_RE.findall(text or "")] + cleaned = re.sub(r"[\\_/]", " ", text or "") + toks = [t.lower() for t in TOKEN_RE.findall(cleaned)] return [t for t in toks if t not in STOPWORDS and len(t) >= 2] @@ -2730,6 +2732,18 @@ _ALLOWED_INSIGHT_TAGS = { _DYNAMIC_TAGS = {"availability", "database", "pods", "utilization", "workloads"} _INVENTORY_TAGS = {"hardware", "architecture", "inventory", "workers", "node_detail", "os"} +_SUBJECTIVE_TAG_PRIORITY = ( + "utilization", + "database", + "pods", + "workloads", + "availability", + "hardware", + "inventory", + "architecture", + "node_detail", + "os", +) def _fact_line_tags(line: str) -> set[str]: @@ -2922,7 +2936,8 @@ def _open_ended_system() -> str: "You may draw light inferences if you label them as such. " "Write concise, human sentences with a helpful, calm tone (not a list). " "Be willing to take a light stance; do not over-hedge. " - "If the question is subjective (cool/interesting/unconventional), pick a standout fact and explain why it stands out. 
" + "If the question is subjective (cool/interesting/unconventional), pick a standout fact, explain why it stands out, " + "and use 2-3 sentences. " "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " @@ -3773,6 +3788,8 @@ def _fast_fact_lines( return [] primary_tags = primary_tags or set() scored: list[tuple[int, int, str]] = [] + priority_map = {tag: idx for idx, tag in enumerate(_SUBJECTIVE_TAG_PRIORITY)} + use_priority = not primary_tags and focus_tags == _ALLOWED_INSIGHT_TAGS for idx, line in enumerate(fact_lines): fid = f"F{idx + 1}" tags = set(fact_meta.get(fid, {}).get("tags") or []) @@ -3783,6 +3800,12 @@ def _fast_fact_lines( score += 4 * len(tags & primary_tags) if focus_tags: score += 2 * len(tags & focus_tags) + if use_priority and tags: + bonus = 0 + for tag in tags: + if tag in priority_map: + bonus = max(bonus, len(priority_map) - priority_map[tag]) + score += bonus scored.append((score, idx, line)) scored.sort(key=lambda item: (-item[0], item[1])) selected: list[str] = [] @@ -3845,13 +3868,27 @@ def _fallback_fact_answer(prompt: str, context: str) -> str: best_fact = "" best_score = -1 for fact in facts: - score = len(tokens & set(_tokens(fact))) + key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", fact) + if not key_match: + key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", fact) + key_tokens: set[str] = set() + if key_match: + key_tokens = set(_tokens(key_match.group(1))) + score = len(tokens & set(_tokens(fact))) + 2 * len(tokens & key_tokens) if score > best_score: best_score = score best_fact = fact if best_score <= 0: return "" - sentence = f"Based on the snapshot, {best_fact}" + key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", best_fact) + if not key_match: + key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", best_fact) + if key_match: + key = key_match.group(1).strip().replace("_", " ") + val = key_match.group(2).strip() + sentence = f"{key.capitalize()} is {val}" + else: + sentence = f"Based on the snapshot, {best_fact}" if not sentence.endswith((".", "!", "?")): sentence += "." 
return sentence @@ -3873,15 +3910,17 @@ def _open_ended_fast_single( prompt: str, *, context: str, + history_lines: list[str] | None = None, state: ThoughtState | None = None, model: str, ) -> str: if state: state.update("drafting", step=1, note="summarizing") + working_context = _append_history_context(context, history_lines or []) if history_lines else context reply = _ollama_call( ("atlasbot_fast", "atlasbot_fast"), prompt, - context=context, + context=working_context, use_history=False, system_override=_open_ended_system(), model=model, @@ -3890,7 +3929,7 @@ def _open_ended_fast_single( reply = _ollama_call( ("atlasbot_fast", "atlasbot_fast"), prompt + " Provide one clear sentence before the score lines.", - context=context, + context=working_context, use_history=False, system_override=_open_ended_system(), model=model, @@ -3933,6 +3972,7 @@ def _open_ended_fast( return _open_ended_fast_single( prompt, context=selected_pack, + history_lines=history_lines, state=state, model=model, ) @@ -4089,7 +4129,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): cleaned_q = normalize_query(cleaned) cluster_affinity = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) subjective = _is_subjective_query(cleaned) - followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS) + followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS) contextual = history_cluster and (followup or followup_affinity) cluster_query = cluster_affinity or contextual context = "" @@ -4633,7 +4673,7 @@ def sync_loop(token: str, room_id: str): cleaned_q = normalize_query(cleaned_body) cluster_affinity = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) subjective = _is_subjective_query(cleaned_body) - followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS) + followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS) contextual = history_cluster and (followup or followup_affinity) cluster_query = cluster_affinity or contextual context = "" From 7c0a25a0eb4de1148135ab67b97faec641b0409a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:51:37 -0300 Subject: [PATCH 411/416] atlasbot: expand fast context for quantitative prompts --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index f007942..7856eed 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-93 + checksum/atlasbot-configmap: manual-atlasbot-94 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 5ce1984..81212ff 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3967,6 +3967,8 @@ def _open_ended_fast( ) selected_meta = _fact_pack_meta(selected_lines) selected_pack = _fact_pack_text(selected_lines, selected_meta) + if _is_quantitative_prompt(prompt) or not selected_lines: + selected_pack = fact_pack if state: state.total_steps = _open_ended_total_steps("fast") return _open_ended_fast_single( From 
da94cc6f97a27f0254c4de1299fc2ba7165eee86 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:56:26 -0300 Subject: [PATCH 412/416] atlasbot: improve fast fallback and usage filtering --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 36 ++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7856eed..7994618 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-94 + checksum/atlasbot-configmap: manual-atlasbot-95 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 81212ff..357941b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -923,7 +923,7 @@ def _nodes_by_arch(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: grouped[(node.get("arch") or "unknown")].append(node["name"]) return {k: sorted(v) for k, v in grouped.items()} -def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]: +def _node_usage_table(metrics: dict[str, Any], *, allowed_nodes: set[str] | None = None) -> list[dict[str, Any]]: usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} per_node: dict[str, dict[str, Any]] = {} for metric_name, entries in usage.items() if isinstance(usage, dict) else []: @@ -935,6 +935,8 @@ def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]: node = entry.get("node") if not isinstance(node, str) or not node: continue + if allowed_nodes and node not in allowed_nodes: + continue per_node.setdefault(node, {})[metric_name] = entry.get("value") return [{"node": node, **vals} for node, vals in sorted(per_node.items())] @@ -1139,7 +1141,8 @@ def facts_context( if items: lines.append(f"- top_restarts_1h: {', '.join(items)}") - usage_table = _node_usage_table(metrics) + allowed_nodes = {node.get("name") for node in inv if isinstance(node, dict) and node.get("name")} + usage_table = _node_usage_table(metrics, allowed_nodes=allowed_nodes or None) if usage_table: lines.append("- node_usage (cpu/ram/net/io):") for entry in usage_table: @@ -3906,6 +3909,31 @@ def _is_quantitative_prompt(prompt: str) -> bool: return False +def _is_list_prompt(prompt: str) -> bool: + q = normalize_query(prompt) + if not q: + return False + if any(phrase in q for phrase in ("list", "names", "name", "show")): + return True + if any(phrase in q for phrase in ("which nodes", "what nodes", "what are the nodes")): + return True + return False + + +def _needs_full_fact_pack(prompt: str) -> bool: + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + if _is_quantitative_prompt(prompt) or _is_list_prompt(prompt): + return True + if tokens & {"workload", "pods", "namespace"}: + return True + if _NAME_INDEX and tokens & _NAME_INDEX: + return True + if any(phrase in q for phrase in ("where does", "where is", "where are", "running", "run on", "hosted on", "primary node")): + return True + return False + + def _open_ended_fast_single( prompt: str, *, @@ -3937,6 +3965,8 @@ def _open_ended_fast_single( fallback = _fallback_fact_answer(prompt, context) if fallback and 
(_is_quantitative_prompt(prompt) or not _has_body_lines(reply)): reply = fallback + if not _has_body_lines(reply): + reply = "I don't have enough data in the current snapshot to answer that." if state: state.update("done", step=_open_ended_total_steps("fast")) return _ensure_scores(reply) @@ -3967,7 +3997,7 @@ def _open_ended_fast( ) selected_meta = _fact_pack_meta(selected_lines) selected_pack = _fact_pack_text(selected_lines, selected_meta) - if _is_quantitative_prompt(prompt) or not selected_lines: + if _needs_full_fact_pack(prompt) or not selected_lines: selected_pack = fact_pack if state: state.total_steps = _open_ended_total_steps("fast") From 043d1cbab39201b1888d80708e42efbd602f8d08 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 04:00:13 -0300 Subject: [PATCH 413/416] atlasbot: clean fact labels and non-cluster confidence --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7994618..58a5564 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-95 + checksum/atlasbot-configmap: manual-atlasbot-96 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 357941b..59a8c2d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2945,6 +2945,7 @@ def _open_ended_system() -> str: "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " "Always include at least one substantive answer sentence before the score lines. " + "Do not mention fact IDs or internal labels (e.g., F1/F2) in your response. " "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " "Do not invent numbers or facts. 
" @@ -4091,6 +4092,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s system_override=system, model=model, ) + reply = re.sub(r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", "", reply, flags=re.IGNORECASE).strip() return _ensure_scores(reply) From dda943ce16248b3bc964b04ebf7e247552698a63 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 04:06:24 -0300 Subject: [PATCH 414/416] atlasbot: expand full-pack triggers and strip inline confidence --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 58a5564..7001190 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-96 + checksum/atlasbot-configmap: manual-atlasbot-97 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 59a8c2d..6f3581f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3030,7 +3030,14 @@ def _ensure_scores(answer: str) -> str: ): _record_score("hallucinationrisk", _extract_value(cleaned)) continue - body_lines.append(line) + cleaned_body = re.sub( + r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", + "", + line, + flags=re.IGNORECASE, + ).strip() + if cleaned_body: + body_lines.append(cleaned_body) confidence = score_map.get("confidence") or "medium" relevance = score_map.get("relevance") or "70" @@ -3926,7 +3933,7 @@ def _needs_full_fact_pack(prompt: str) -> bool: tokens = set(_tokens(prompt)) if _is_quantitative_prompt(prompt) or _is_list_prompt(prompt): return True - if tokens & {"workload", "pods", "namespace"}: + if tokens & {"workload", "pods", "namespace", "worker", "workers"}: return True if _NAME_INDEX and tokens & _NAME_INDEX: return True From 436e56c5de8fee5e6e1eed1251c95c80e7c25624 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 04:10:31 -0300 Subject: [PATCH 415/416] atlasbot: favor factual fallback in fast mode --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7001190..187cd6c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-97 + checksum/atlasbot-configmap: manual-atlasbot-98 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6f3581f..7fcc066 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2948,6 +2948,7 @@ def _open_ended_system() -> str: "Do not mention fact IDs or internal labels (e.g., F1/F2) in your response. 
" "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " + "Do not convert counts into percentages or claim 100% unless a fact explicitly states a percentage. " "Do not invent numbers or facts. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." ) @@ -4007,6 +4008,10 @@ def _open_ended_fast( selected_pack = _fact_pack_text(selected_lines, selected_meta) if _needs_full_fact_pack(prompt) or not selected_lines: selected_pack = fact_pack + if not subjective and _needs_full_fact_pack(prompt): + fallback = _fallback_fact_answer(prompt, fact_pack) + if fallback: + return _ensure_scores(fallback) if state: state.total_steps = _open_ended_total_steps("fast") return _open_ended_fast_single( From aa608fbf0f629e03c186324778b3cab995fc340b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 11:02:10 -0300 Subject: [PATCH 416/416] atlasbot: improve fact parsing and fallback answers --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 227 ++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 19 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 187cd6c..b65aef0 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-98 + checksum/atlasbot-configmap: manual-atlasbot-101 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7fcc066..be256c0 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -260,7 +260,24 @@ def normalize_query(text: str) -> str: def _tokens(text: str) -> list[str]: cleaned = re.sub(r"[\\_/]", " ", text or "") toks = [t.lower() for t in TOKEN_RE.findall(cleaned)] - return [t for t in toks if t not in STOPWORDS and len(t) >= 2] + expanded: list[str] = [] + synonyms = { + "network": "net", + "net": "network", + "memory": "ram", + "ram": "memory", + "i/o": "io", + } + for token in toks: + expanded.append(token) + if "-" in token: + expanded.extend(part for part in token.split("-") if part) + for token in list(expanded): + if token in synonyms: + expanded.append(synonyms[token]) + if token.endswith("s") and len(token) > 3: + expanded.append(token.rstrip("s")) + return [t for t in expanded if t not in STOPWORDS and len(t) >= 2] def _ensure_confidence(text: str) -> str: @@ -1077,10 +1094,16 @@ def facts_context( lines.append(f"- expected_workers_missing: {', '.join(missing)}") hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + usage_metrics = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} for key in ("cpu", "ram", "net", "io"): entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} node = entry.get("node") value = entry.get("value") + if not node or value is None: + usage = usage_metrics.get(key) if isinstance(usage_metrics.get(key), list) else [] + pick = _node_usage_top(usage, allowed_nodes=None) + if pick: + node, value = pick if node and value is not None: 
value_fmt = _format_metric_value( str(value), @@ -3001,6 +3024,7 @@ def _ensure_scores(answer: str) -> str: def _record_score(key: str, value: str): if not value: return + value = value.strip().rstrip("%") score_map.setdefault(key, value) for line in lines: @@ -3010,10 +3034,10 @@ def _ensure_scores(answer: str) -> str: "confidence" in lowered and "relevance" in lowered and "satisfaction" in lowered ): for key in ("confidence", "relevance", "satisfaction"): - match = re.search(rf"{key}\\s*[:=]?\\s*(\\d{{1,3}}|high|medium|low)", lowered) + match = re.search(rf"{key}\s*[:=]?\s*(\d{{1,3}}|high|medium|low)", lowered) if match: _record_score(key, match.group(1)) - risk_match = re.search(r"hallucination\\s*risk\\s*[:=]?\\s*(low|medium|high)", lowered) + risk_match = re.search(r"hallucination\s*risk\s*[:=]?\s*(low|medium|high)", lowered) if risk_match: _record_score("hallucinationrisk", risk_match.group(1)) continue @@ -3032,11 +3056,18 @@ def _ensure_scores(answer: str) -> str: _record_score("hallucinationrisk", _extract_value(cleaned)) continue cleaned_body = re.sub( - r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", + r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", line, flags=re.IGNORECASE, ).strip() + cleaned_body = re.sub( + r"\bconfident\s*level\s*:\s*(high|medium|low)\b\.?\s*", + "", + cleaned_body, + flags=re.IGNORECASE, + ).strip() + cleaned_body = re.sub(r"\bF\d+\b", "", cleaned_body).strip() if cleaned_body: body_lines.append(cleaned_body) @@ -3860,41 +3891,195 @@ def _has_body_lines(answer: str) -> bool: def _fallback_fact_answer(prompt: str, context: str) -> str: facts: list[str] = [] + parsed_facts: list[tuple[str, str | None, str | None]] = [] + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) for line in (context or "").splitlines(): trimmed = line.strip() - if not trimmed.startswith("F"): + if not trimmed: continue - match = re.match(r"^F\\d+.*?\\]:\\s*(.*)$", trimmed) - if not match: - match = re.match(r"^F\\d+:\\s*(.*)$", trimmed) - if not match: - continue - fact = match.group(1).strip() + if trimmed.startswith("F"): + match = re.match(r"^F\d+.*?\]:\s*(.*)$", trimmed) + if not match: + match = re.match(r"^F\d+:\s*(.*)$", trimmed) + if not match: + continue + fact = match.group(1).strip() + else: + if trimmed.lower().startswith("fact pack") or trimmed.lower().startswith("facts"): + continue + if trimmed.startswith("-"): + fact = trimmed.lstrip("-").strip() + else: + fact = trimmed if fact.startswith("-"): fact = fact.lstrip("-").strip() - if fact: + if fact and (":" in fact or "=" in fact): facts.append(fact) + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact) + if not key_match: + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact) + if key_match: + parsed_facts.append((fact, key_match.group(1).strip(), key_match.group(2).strip())) + else: + parsed_facts.append((fact, None, None)) if not facts: return "" - tokens = set(_tokens(prompt)) + + def _norm_key(text: str) -> str: + return normalize_query(text).replace(" ", "_") + + def _find_value(target: str) -> str | None: + for _fact, key, val in parsed_facts: + if key and _norm_key(key) == target: + return val + return None + + def _parse_counts(text: str) -> dict[str, int]: + counts: dict[str, int] = {} + for part in (text or "").split(","): + if "=" not in part: + continue + k, v = part.split("=", 1) + k = k.strip() + v = v.strip() + if not k or not v: + continue + try: + counts[k] = int(float(v)) + except ValueError: + continue + return counts + + def _parse_map(text: str) -> dict[str, str]: + 
mapping: dict[str, str] = {} + pattern = re.compile(r"(\w+)\s*=\s*([^=]+?)(?=(?:\s*,\s*\w+\s*=)|$)") + for match in pattern.finditer(text or ""): + mapping[match.group(1).strip()] = match.group(2).strip().strip(",") + return mapping + + list_intent = _is_list_prompt(prompt) or "name" in tokens + count_intent = _is_quantitative_prompt(prompt) and ("how many" in q or "count" in tokens or "number" in tokens) + hottest_intent = any(word in q for word in ("hottest", "highest", "most", "top", "busiest")) + metric = _detect_metric(q) + include_hw, _exclude_hw = _detect_hardware_filters(q) + + if hottest_intent and metric in {"cpu", "ram", "net", "io"}: + hottest_val = _find_value(f"hottest_{metric}") + if hottest_val: + return f"Hottest {metric} is {hottest_val}." + if hottest_intent and tokens & {"postgres", "database", "db", "connections"}: + hottest_db = _find_value("postgres_hottest_db") + if hottest_db: + return f"Hottest database is {hottest_db}." + + if count_intent and tokens & {"pods", "pod"}: + pending = _find_value("pods_pending") + failed = _find_value("pods_failed") + running = _find_value("pods_running") + succeeded = _find_value("pods_succeeded") + if "pending" in q and "failed" in q: + try: + total = float(pending or 0) + float(failed or 0) + return f"Pods pending or failed: {total:.0f}." + except ValueError: + pass + if "pending" in q and pending is not None: + return f"Pods pending is {pending}." + if "failed" in q and failed is not None: + return f"Pods failed is {failed}." + if "succeeded" in q and succeeded is not None: + return f"Pods succeeded is {succeeded}." + if "running" in q and running is not None: + return f"Pods running is {running}." + + if count_intent and tokens & {"nodes", "node"} and "not ready" in q: + nodes_total = _find_value("nodes_total") + if nodes_total and "not_ready" in nodes_total: + match = re.search(r"not_ready=([0-9.]+)", nodes_total) + if match: + return f"Not ready nodes: {match.group(1)}." + + if count_intent and include_hw: + counts_line = _find_value("nodes_by_hardware_count") + if counts_line: + counts = _parse_counts(counts_line) + for hw in include_hw: + if hw in counts: + return f"{hw} nodes: {counts[hw]}." + for hw in include_hw: + hw_line = _find_value(hw) + if hw_line: + items = [item.strip() for item in hw_line.split(",") if item.strip()] + return f"{hw} nodes: {len(items)}." + + if list_intent and include_hw: + if "control" in q: + cp_by_hw = _find_value("control_plane_by_hardware") + if cp_by_hw: + mapping = _parse_map(cp_by_hw) + for hw in include_hw: + if hw in mapping: + return f"{hw} control-plane nodes: {mapping[hw]}." + cp_nodes = _find_value("control_plane_nodes") + if cp_nodes: + return f"Control-plane nodes: {cp_nodes}." + for hw in include_hw: + hw_line = _find_value(hw) + if hw_line: + return f"{hw} nodes: {hw_line}." + + if list_intent and "control" in q: + cp_nodes = _find_value("control_plane_nodes") + if cp_nodes: + return f"Control-plane nodes: {cp_nodes}." 
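# A minimal sketch showing the _parse_map() lookahead regex above in isolation.
# The lazy value group combined with the "(?=next key=...)|$" lookahead lets a
# value contain commas (e.g. a node list) without being split apart, because
# the value only ends where another "word=" pair begins. The sample input below
# is illustrative, not taken from cluster data.
import re

def _parse_map_sketch(text):
    mapping = {}
    pattern = re.compile(r"(\w+)\s*=\s*([^=]+?)(?=(?:\s*,\s*\w+\s*=)|$)")
    for match in pattern.finditer(text or ""):
        mapping[match.group(1).strip()] = match.group(2).strip().strip(",")
    return mapping

# _parse_map_sketch("rpi4=titan-01, titan-02, nuc=titan-10")
# -> {"rpi4": "titan-01, titan-02", "nuc": "titan-10"}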
+ + preferred = tokens & { + "node", + "nodes", + "pod", + "pods", + "postgres", + "db", + "database", + "namespace", + "workload", + "worker", + "workers", + "cpu", + "ram", + "memory", + "net", + "network", + "io", + "disk", + "connection", + "connections", + } best_fact = "" best_score = -1 for fact in facts: - key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", fact) + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact) if not key_match: - key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", fact) + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact) key_tokens: set[str] = set() if key_match: key_tokens = set(_tokens(key_match.group(1))) score = len(tokens & set(_tokens(fact))) + 2 * len(tokens & key_tokens) + if preferred: + score += 3 * len(preferred & key_tokens) + if not (preferred & key_tokens): + score -= 1 + if list_intent and key_match and "count" in key_tokens: + score -= 3 if score > best_score: best_score = score best_fact = fact if best_score <= 0: return "" - key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", best_fact) + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", best_fact) if not key_match: - key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", best_fact) + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", best_fact) if key_match: key = key_match.group(1).strip().replace("_", " ") val = key_match.group(2).strip() @@ -3936,6 +4121,10 @@ def _needs_full_fact_pack(prompt: str) -> bool: return True if tokens & {"workload", "pods", "namespace", "worker", "workers"}: return True + if tokens & {"arch", "architecture", "hardware"}: + return True + if tokens & METRIC_HINT_WORDS: + return True if _NAME_INDEX and tokens & _NAME_INDEX: return True if any(phrase in q for phrase in ("where does", "where is", "where are", "running", "run on", "hosted on", "primary node")): @@ -4104,7 +4293,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s system_override=system, model=model, ) - reply = re.sub(r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", "", reply, flags=re.IGNORECASE).strip() + reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip() return _ensure_scores(reply) @@ -4405,6 +4594,8 @@ def _is_cluster_query( return True if any(word in q for word in CLUSTER_HINT_WORDS): return True + if any(word in q for word in METRIC_HINT_WORDS): + return True for host_match in HOST_RE.finditer(q): host = host_match.group(1).lower() if host.endswith("bstein.dev"):
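The scoring loop in _fallback_fact_answer above weights key-token matches twice
as heavily as overall fact overlap, then adds a further bonus for the preferred
cluster vocabulary so domain-specific facts outrank generic ones. A minimal
self-contained sketch of that heuristic (tokenize() here is an assumed
stand-in for the module's _tokens(), and the sample facts are illustrative):

def best_fact_sketch(question_tokens, facts, preferred):
    """Pick the 'key: value' fact that best matches the question tokens."""
    def tokenize(text):
        # Assumed stand-in for _tokens(): lowercase words of length >= 2.
        return {t for t in text.lower().replace("_", " ").split() if len(t) >= 2}

    best, best_score = "", -1
    for key, value in facts.items():
        key_tokens = tokenize(key)
        fact_tokens = key_tokens | tokenize(value)
        # Key matches count double; preferred vocabulary adds a further bonus.
        score = len(question_tokens & fact_tokens) + 2 * len(question_tokens & key_tokens)
        if preferred:
            score += 3 * len(preferred & key_tokens)
            if not preferred & key_tokens:
                score -= 1  # nudge generic facts below domain-specific ones
        if score > best_score:
            best, best_score = f"{key}: {value}", score
    return best if best_score > 0 else ""

# best_fact_sketch({"pods", "pending"},
#                  {"pods_pending": "3", "nodes_total": "ready=24 not_ready=0"},
#                  preferred={"pods", "pod"})
# -> "pods_pending: 3"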