From 11a06e7683c07f43e2dd797de55d505648883e87 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 16:58:02 -0300 Subject: [PATCH 001/416] feat: add Ariadne service and glue scheduling --- scripts/dashboards_render_atlas.py | 37 ++++ .../vaultwarden-cred-sync-cronjob.yaml | 1 + services/finance/portal-rbac.yaml | 14 ++ services/health/portal-rbac.yaml | 16 +- .../health/wger-admin-ensure-cronjob.yaml | 1 + services/keycloak/realm-settings-job.yaml | 2 + services/mailu/mailu-sync-cronjob.yaml | 1 + services/maintenance/ariadne-deployment.yaml | 181 ++++++++++++++++++ services/maintenance/ariadne-service.yaml | 13 ++ .../maintenance/ariadne-serviceaccount.yaml | 8 + services/maintenance/kustomization.yaml | 6 + services/maintenance/secretproviderclass.yaml | 21 ++ .../maintenance/vault-serviceaccount.yaml | 6 + .../maintenance/vault-sync-deployment.yaml | 34 ++++ .../monitoring/dashboards/atlas-testing.json | 150 +++++++++++++++ .../monitoring/grafana-dashboard-testing.yaml | 150 +++++++++++++++ services/nextcloud-mail-sync/cronjob.yaml | 1 + services/nextcloud-mail-sync/portal-rbac.yaml | 13 ++ .../vault/scripts/vault_k8s_auth_configure.sh | 2 + services/vaultwarden/ariadne-rbac.yaml | 28 +++ services/vaultwarden/kustomization.yaml | 1 + 21 files changed, 685 insertions(+), 1 deletion(-) create mode 100644 services/maintenance/ariadne-deployment.yaml create mode 100644 services/maintenance/ariadne-service.yaml create mode 100644 services/maintenance/ariadne-serviceaccount.yaml create mode 100644 services/maintenance/secretproviderclass.yaml create mode 100644 services/maintenance/vault-serviceaccount.yaml create mode 100644 services/maintenance/vault-sync-deployment.yaml create mode 100644 services/vaultwarden/ariadne-rbac.yaml diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 0931b48b..116bf218 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -336,6 +336,10 @@ GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPE GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" +ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' +ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" +ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -2230,6 +2234,39 @@ def build_testing_dashboard(): instant=True, ) ) + panels.append( + table_panel( + 7, + "Ariadne Task Errors (24h)", + ARIADNE_TASK_ERRORS_24H, + {"h": 6, "w": 12, "x": 0, "y": 12}, + unit="none", + transformations=sort_desc, + instant=True, + ) + ) + panels.append( + table_panel( + 8, + "Ariadne Schedule Last Success (hours ago)", + ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 12}, + unit="h", + transformations=sort_desc, + instant=True, + ) + ) + panels.append( + table_panel( + 9, + "Ariadne Access Requests", + ARIADNE_ACCESS_REQUESTS, + {"h": 4, "w": 24, "x": 0, "y": 18}, + unit="none", + transformations=sort_desc, + instant=True, + ) + ) return { "uid": "atlas-testing", diff --git a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml index 29141fe4..acd851b1 100644 --- a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml +++ b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/finance/portal-rbac.yaml b/services/finance/portal-rbac.yaml index 2fb7eded..66eafea9 100644 --- a/services/finance/portal-rbac.yaml +++ b/services/finance/portal-rbac.yaml @@ -29,3 +29,17 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-firefly-user-sync + namespace: finance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-firefly-user-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/health/portal-rbac.yaml b/services/health/portal-rbac.yaml index cd9acd19..feb74414 100644 --- a/services/health/portal-rbac.yaml +++ b/services/health/portal-rbac.yaml @@ -8,7 +8,7 @@ rules: - apiGroups: ["batch"] resources: ["cronjobs"] verbs: ["get"] - resourceNames: ["wger-user-sync"] + resourceNames: ["wger-user-sync", "wger-admin-ensure"] - apiGroups: ["batch"] resources: ["jobs"] verbs: ["create", "get", "list", "watch"] @@ -29,3 +29,17 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-wger-user-sync + namespace: health +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-wger-user-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/health/wger-admin-ensure-cronjob.yaml b/services/health/wger-admin-ensure-cronjob.yaml index db178a30..a1063dd9 100644 --- a/services/health/wger-admin-ensure-cronjob.yaml +++ b/services/health/wger-admin-ensure-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "15 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index f6802005..a0b36ec5 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -331,6 +331,8 @@ spec: # Ensure basic realm groups exist for provisioning. ensure_group("dev") ensure_group("admin") + ensure_group("demo") + ensure_group("test") planka_group = ensure_group("planka-users") if planka_group and planka_group.get("id"): diff --git a/services/mailu/mailu-sync-cronjob.yaml b/services/mailu/mailu-sync-cronjob.yaml index 1da19810..671439d5 100644 --- a/services/mailu/mailu-sync-cronjob.yaml +++ b/services/mailu/mailu-sync-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "30 4 * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml new file mode 100644 index 00000000..fd2fb797 --- /dev/null +++ b/services/maintenance/ariadne-deployment.yaml @@ -0,0 +1,181 @@ +# services/maintenance/ariadne-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ariadne + namespace: maintenance +spec: + replicas: 1 + revisionHistoryLimit: 3 + selector: + matchLabels: + app: ariadne + template: + metadata: + labels: + app: ariadne + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "maintenance" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/portal/atlas-portal-db" + vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} + export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" + {{ end }} + {{ with secret "kv/data/atlas/mailu/mailu-db-secret" }} + export MAILU_DB_NAME="{{ .Data.data.database }}" + export MAILU_DB_USER="{{ .Data.data.username }}" + export MAILU_DB_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/mailu/mailu-initial-account-secret" }} + export SMTP_HOST="mailu-front.mailu-mailserver.svc.cluster.local" + export SMTP_PORT="587" + export SMTP_STARTTLS="true" + export SMTP_USE_TLS="false" + export SMTP_USERNAME="no-reply-portal@bstein.dev" + export SMTP_PASSWORD="{{ .Data.data.password }}" + export SMTP_FROM="no-reply-portal@bstein.dev" + {{ end }} + spec: + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: ariadne + image: registry.bstein.dev/bstein/ariadne:0.1.0 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . /vault/secrets/ariadne-env.sh + && exec uvicorn ariadne.app:app --host 0.0.0.0 --port 8080 + ports: + - name: http + containerPort: 8080 + env: + - name: KEYCLOAK_URL + value: https://sso.bstein.dev + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_CLIENT_ID + value: bstein-dev-home + - name: KEYCLOAK_ISSUER + value: https://sso.bstein.dev/realms/atlas + - name: KEYCLOAK_JWKS_URL + value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs + - name: KEYCLOAK_ADMIN_URL + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_ADMIN_REALM + value: atlas + - name: KEYCLOAK_ADMIN_CLIENT_ID + value: bstein-dev-home-admin + - name: PORTAL_PUBLIC_BASE_URL + value: https://bstein.dev + - name: PORTAL_ADMIN_USERS + value: bstein + - name: PORTAL_ADMIN_GROUPS + value: admin + - name: ACCOUNT_ALLOWED_GROUPS + value: dev,admin + - name: ALLOWED_FLAG_GROUPS + value: demo,test + - name: DEFAULT_USER_GROUPS + value: dev + - name: MAILU_DOMAIN + value: bstein.dev + - name: MAILU_SYNC_URL + value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events + - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC + value: "60" + - name: MAILU_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: MAILU_DB_PORT + value: "5432" + - name: NEXTCLOUD_NAMESPACE + value: nextcloud + - name: NEXTCLOUD_MAIL_SYNC_CRONJOB + value: nextcloud-mail-sync + - name: NEXTCLOUD_MAIL_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: NEXTCLOUD_MAIL_SYNC_JOB_TTL_SEC + value: "3600" + - name: WGER_NAMESPACE + value: health + - name: WGER_USER_SYNC_CRONJOB + value: wger-user-sync + - name: WGER_ADMIN_CRONJOB + value: wger-admin-ensure + - name: WGER_USER_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: FIREFLY_NAMESPACE + value: finance + - name: FIREFLY_USER_SYNC_CRONJOB + value: firefly-user-sync + - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: VAULTWARDEN_NAMESPACE + value: vaultwarden + - name: VAULTWARDEN_POD_LABEL + value: app=vaultwarden + - name: VAULTWARDEN_POD_PORT + value: "80" + - name: VAULTWARDEN_SERVICE_HOST + value: vaultwarden-service.vaultwarden.svc.cluster.local + - name: VAULTWARDEN_ADMIN_SECRET_NAME + value: vaultwarden-admin + - name: VAULTWARDEN_ADMIN_SECRET_KEY + value: ADMIN_TOKEN + - name: VAULTWARDEN_ADMIN_SESSION_TTL_SEC + value: "900" + - name: VAULTWARDEN_ADMIN_RATE_LIMIT_BACKOFF_SEC + value: "600" + - name: VAULTWARDEN_RETRY_COOLDOWN_SEC + value: "1800" + - name: VAULTWARDEN_FAILURE_BAILOUT + value: "2" + - name: ARIADNE_PROVISION_POLL_INTERVAL_SEC + value: "5" + - name: ARIADNE_PROVISION_RETRY_COOLDOWN_SEC + value: "30" + - name: ARIADNE_SCHEDULE_TICK_SEC + value: "5" + - name: ARIADNE_SCHEDULE_MAILU_SYNC + value: "30 4 * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC + value: "0 5 * * *" + - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC + value: "*/15 * * * *" + - name: ARIADNE_SCHEDULE_WGER_ADMIN + value: "15 3 * * *" + - name: WELCOME_EMAIL_ENABLED + value: "true" + - name: K8S_API_TIMEOUT_SEC + value: "5" + - name: METRICS_PATH + value: "/metrics" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 10 diff --git a/services/maintenance/ariadne-service.yaml b/services/maintenance/ariadne-service.yaml new file mode 100644 index 00000000..9c93e1df --- /dev/null +++ b/services/maintenance/ariadne-service.yaml @@ -0,0 +1,13 @@ +# services/maintenance/ariadne-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: ariadne + namespace: maintenance +spec: + selector: + app: ariadne + ports: + - name: http + port: 80 + targetPort: http diff --git a/services/maintenance/ariadne-serviceaccount.yaml b/services/maintenance/ariadne-serviceaccount.yaml new file mode 100644 index 00000000..9adcef7e --- /dev/null +++ b/services/maintenance/ariadne-serviceaccount.yaml @@ -0,0 +1,8 @@ +# services/maintenance/ariadne-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ariadne + namespace: maintenance +imagePullSecrets: + - name: harbor-regcred diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index e53ed3c7..f0f3de52 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -3,10 +3,16 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - namespace.yaml + - secretproviderclass.yaml + - vault-serviceaccount.yaml + - vault-sync-deployment.yaml + - ariadne-serviceaccount.yaml - disable-k3s-traefik-serviceaccount.yaml - k3s-traefik-cleanup-rbac.yaml - node-nofile-serviceaccount.yaml - pod-cleaner-rbac.yaml + - ariadne-deployment.yaml + - ariadne-service.yaml - disable-k3s-traefik-daemonset.yaml - k3s-traefik-cleanup-job.yaml - node-nofile-daemonset.yaml diff --git a/services/maintenance/secretproviderclass.yaml b/services/maintenance/secretproviderclass.yaml new file mode 100644 index 00000000..dd959480 --- /dev/null +++ b/services/maintenance/secretproviderclass.yaml @@ -0,0 +1,21 @@ +# services/maintenance/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: maintenance-vault + namespace: maintenance +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "maintenance" + objects: | + - objectName: "harbor-pull__dockerconfigjson" + secretPath: "kv/data/atlas/harbor-pull/maintenance" + secretKey: "dockerconfigjson" + secretObjects: + - secretName: harbor-regcred + type: kubernetes.io/dockerconfigjson + data: + - objectName: harbor-pull__dockerconfigjson + key: .dockerconfigjson diff --git a/services/maintenance/vault-serviceaccount.yaml b/services/maintenance/vault-serviceaccount.yaml new file mode 100644 index 00000000..f60b43ec --- /dev/null +++ b/services/maintenance/vault-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/maintenance/vault-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: maintenance-vault-sync + namespace: maintenance diff --git a/services/maintenance/vault-sync-deployment.yaml b/services/maintenance/vault-sync-deployment.yaml new file mode 100644 index 00000000..edc04561 --- /dev/null +++ b/services/maintenance/vault-sync-deployment.yaml @@ -0,0 +1,34 @@ +# services/maintenance/vault-sync-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: maintenance-vault-sync + namespace: maintenance +spec: + replicas: 1 + selector: + matchLabels: + app: maintenance-vault-sync + template: + metadata: + labels: + app: maintenance-vault-sync + spec: + serviceAccountName: maintenance-vault-sync + containers: + - name: sync + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - "sleep infinity" + volumeMounts: + - name: vault-secrets + mountPath: /vault/secrets + readOnly: true + volumes: + - name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: maintenance-vault diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index 25cf3f83..c9c0c9ab 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -321,6 +321,156 @@ } } ] + }, + { + "id": 7, + "type": "table", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 8, + "type": "table", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 9, + "type": "table", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] } ], "time": { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 80a70438..7746f165 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -330,6 +330,156 @@ data: } } ] + }, + { + "id": 7, + "type": "table", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 8, + "type": "table", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 9, + "type": "table", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] } ], "time": { diff --git a/services/nextcloud-mail-sync/cronjob.yaml b/services/nextcloud-mail-sync/cronjob.yaml index 2073d76e..6913b603 100644 --- a/services/nextcloud-mail-sync/cronjob.yaml +++ b/services/nextcloud-mail-sync/cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "0 5 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 3 failedJobsHistoryLimit: 1 diff --git a/services/nextcloud-mail-sync/portal-rbac.yaml b/services/nextcloud-mail-sync/portal-rbac.yaml index dc9a4e4b..009b2e08 100644 --- a/services/nextcloud-mail-sync/portal-rbac.yaml +++ b/services/nextcloud-mail-sync/portal-rbac.yaml @@ -27,3 +27,16 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-nextcloud-mail-sync +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-nextcloud-mail-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 202879f4..ca94ac66 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -230,6 +230,8 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ "crypto/* harbor-pull/crypto" "" write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" +write_policy_and_role "maintenance" "maintenance" "ariadne" \ + "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret harbor-pull/maintenance" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ diff --git a/services/vaultwarden/ariadne-rbac.yaml b/services/vaultwarden/ariadne-rbac.yaml new file mode 100644 index 00000000..ee903ca8 --- /dev/null +++ b/services/vaultwarden/ariadne-rbac.yaml @@ -0,0 +1,28 @@ +# services/vaultwarden/ariadne-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: ariadne-vaultwarden-admin-reader + namespace: vaultwarden +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get"] + resourceNames: ["vaultwarden-admin"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-vaultwarden-admin-reader + namespace: vaultwarden +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: ariadne-vaultwarden-admin-reader +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/vaultwarden/kustomization.yaml b/services/vaultwarden/kustomization.yaml index c53cb1c6..ca5ef269 100644 --- a/services/vaultwarden/kustomization.yaml +++ b/services/vaultwarden/kustomization.yaml @@ -5,6 +5,7 @@ namespace: vaultwarden resources: - namespace.yaml - serviceaccount.yaml + - ariadne-rbac.yaml - pvc.yaml - deployment.yaml - service.yaml From f3620aa2a4e04ed5b186769963e1915001372c90 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 19:02:14 -0300 Subject: [PATCH 002/416] chore: centralize harbor pull credentials --- .../longhorn/core/secretproviderclass.yaml | 2 +- .../bstein-dev-home/secretproviderclass.yaml | 2 +- services/comms/secretproviderclass.yaml | 2 +- .../crypto/xmr-miner/secretproviderclass.yaml | 2 +- services/harbor/secretproviderclass.yaml | 2 +- services/keycloak/secretproviderclass.yaml | 2 +- services/logging/secretproviderclass.yaml | 2 +- services/mailu/secretproviderclass.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 2 +- services/maintenance/image.yaml | 21 ++++++++++++++++++ services/maintenance/kustomization.yaml | 5 +++++ services/maintenance/secretproviderclass.yaml | 2 +- services/monitoring/secretproviderclass.yaml | 2 +- services/pegasus/secretproviderclass.yaml | 2 +- .../vault/scripts/vault_k8s_auth_configure.sh | 22 +++++++++---------- 15 files changed, 49 insertions(+), 23 deletions(-) create mode 100644 services/maintenance/image.yaml diff --git a/infrastructure/longhorn/core/secretproviderclass.yaml b/infrastructure/longhorn/core/secretproviderclass.yaml index 031d1d8a..e292b86a 100644 --- a/infrastructure/longhorn/core/secretproviderclass.yaml +++ b/infrastructure/longhorn/core/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "longhorn" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/longhorn" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: longhorn-registry diff --git a/services/bstein-dev-home/secretproviderclass.yaml b/services/bstein-dev-home/secretproviderclass.yaml index f330fe68..2fa714a9 100644 --- a/services/bstein-dev-home/secretproviderclass.yaml +++ b/services/bstein-dev-home/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "bstein-dev-home" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/bstein-dev-home" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/comms/secretproviderclass.yaml b/services/comms/secretproviderclass.yaml index 69d4b2b3..0a895527 100644 --- a/services/comms/secretproviderclass.yaml +++ b/services/comms/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "comms" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/comms" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/crypto/xmr-miner/secretproviderclass.yaml b/services/crypto/xmr-miner/secretproviderclass.yaml index a72097fc..12e4ba19 100644 --- a/services/crypto/xmr-miner/secretproviderclass.yaml +++ b/services/crypto/xmr-miner/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "crypto" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/crypto" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/harbor/secretproviderclass.yaml b/services/harbor/secretproviderclass.yaml index 03fef95a..636f6fa8 100644 --- a/services/harbor/secretproviderclass.yaml +++ b/services/harbor/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "harbor" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/harbor" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/keycloak/secretproviderclass.yaml b/services/keycloak/secretproviderclass.yaml index 86cebd24..d4c094f2 100644 --- a/services/keycloak/secretproviderclass.yaml +++ b/services/keycloak/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "sso" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/sso" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/logging/secretproviderclass.yaml b/services/logging/secretproviderclass.yaml index f5db15ee..6ff642d2 100644 --- a/services/logging/secretproviderclass.yaml +++ b/services/logging/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "logging" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/logging" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/mailu/secretproviderclass.yaml b/services/mailu/secretproviderclass.yaml index f58c69b3..f9e281e5 100644 --- a/services/mailu/secretproviderclass.yaml +++ b/services/mailu/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "mailu-mailserver" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/mailu-mailserver" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index fd2fb797..ee4884da 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -49,7 +49,7 @@ spec: node-role.kubernetes.io/worker: "true" containers: - name: ariadne - image: registry.bstein.dev/bstein/ariadne:0.1.0 + image: registry.bstein.dev/bstein/ariadne:0.1.0-0 imagePullPolicy: Always command: ["/bin/sh", "-c"] args: diff --git a/services/maintenance/image.yaml b/services/maintenance/image.yaml new file mode 100644 index 00000000..95acbd0b --- /dev/null +++ b/services/maintenance/image.yaml @@ -0,0 +1,21 @@ +# services/maintenance/image.yaml +apiVersion: image.toolkit.fluxcd.io/v1beta2 +kind: ImageRepository +metadata: + name: ariadne + namespace: maintenance +spec: + image: registry.bstein.dev/bstein/ariadne + interval: 1m0s +--- +apiVersion: image.toolkit.fluxcd.io/v1beta2 +kind: ImagePolicy +metadata: + name: ariadne + namespace: maintenance +spec: + imageRepositoryRef: + name: ariadne + policy: + semver: + range: ">=0.1.0-0" diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index f0f3de52..5e199a98 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - namespace.yaml + - image.yaml - secretproviderclass.yaml - vault-serviceaccount.yaml - vault-sync-deployment.yaml @@ -22,6 +23,10 @@ resources: - node-image-sweeper-daemonset.yaml - image-sweeper-cronjob.yaml +images: + - name: registry.bstein.dev/bstein/ariadne + newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:ariadne"} + configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance diff --git a/services/maintenance/secretproviderclass.yaml b/services/maintenance/secretproviderclass.yaml index dd959480..85df2af5 100644 --- a/services/maintenance/secretproviderclass.yaml +++ b/services/maintenance/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "maintenance" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/maintenance" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/monitoring/secretproviderclass.yaml b/services/monitoring/secretproviderclass.yaml index 8a6c5fbb..350d6aa3 100644 --- a/services/monitoring/secretproviderclass.yaml +++ b/services/monitoring/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "monitoring" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/monitoring" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/pegasus/secretproviderclass.yaml b/services/pegasus/secretproviderclass.yaml index b4621a57..b8d1df96 100644 --- a/services/pegasus/secretproviderclass.yaml +++ b/services/pegasus/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "pegasus" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/jellyfin" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index ca94ac66..c7eaf859 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -203,42 +203,42 @@ write_policy_and_role "outline" "outline" "outline-vault" \ write_policy_and_role "planka" "planka" "planka-vault" \ "planka/* shared/postmark-relay" "" write_policy_and_role "bstein-dev-home" "bstein-dev-home" "bstein-dev-home,bstein-dev-home-vault-sync" \ - "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret harbor-pull/bstein-dev-home" "" + "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "gitea" "gitea" "gitea-vault" \ "gitea/*" "" write_policy_and_role "vaultwarden" "vaultwarden" "vaultwarden-vault" \ "vaultwarden/* mailu/mailu-initial-account-secret" "" write_policy_and_role "sso" "sso" "sso-vault,sso-vault-sync,mas-secrets-ensure" \ - "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay harbor-pull/sso" "" + "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "mailu-mailserver" "mailu-mailserver" "mailu-vault-sync" \ - "mailu/* shared/postmark-relay harbor-pull/mailu-mailserver" "" + "mailu/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "harbor" "harbor" "harbor-vault-sync" \ - "harbor/* harbor-pull/harbor" "" + "harbor/* shared/harbor-pull" "" write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \ "nextcloud/* shared/keycloak-admin shared/postmark-relay" "" write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ - "comms/* shared/chat-ai-keys-runtime harbor-pull/comms" "" + "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" write_policy_and_role "jenkins" "jenkins" "jenkins" \ "jenkins/*" "" write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ - "monitoring/* shared/postmark-relay harbor-pull/monitoring" "" + "monitoring/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "logging" "logging" "logging-vault-sync" \ - "logging/* harbor-pull/logging" "" + "logging/* shared/harbor-pull" "" write_policy_and_role "pegasus" "jellyfin" "pegasus-vault-sync" \ - "pegasus/* harbor-pull/jellyfin" "" + "pegasus/* shared/harbor-pull" "" write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ - "crypto/* harbor-pull/crypto" "" + "crypto/* shared/harbor-pull" "" write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne" \ - "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret harbor-pull/maintenance" "" + "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ "" \ "finance/*" write_policy_and_role "longhorn" "longhorn-system" "longhorn-vault,longhorn-vault-sync" \ - "longhorn/* harbor-pull/longhorn" "" + "longhorn/* shared/harbor-pull" "" write_policy_and_role "postgres" "postgres" "postgres-vault" \ "postgres/postgres-db" "" write_policy_and_role "vault" "vault" "vault" \ From a6b317097e172079ed89099bed139205d4f0b296 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 19:07:00 -0300 Subject: [PATCH 003/416] fix: allow maintenance vault sync role --- services/vault/scripts/vault_k8s_auth_configure.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index c7eaf859..a5ccb61d 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -230,7 +230,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ "crypto/* shared/harbor-pull" "" write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" -write_policy_and_role "maintenance" "maintenance" "ariadne" \ +write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" From 0e07ca791ee045ec060fae05b62474ee547b2663 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 19:22:53 -0300 Subject: [PATCH 004/416] feat: wire portal to ariadne --- services/bstein-dev-home/backend-deployment.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 376622c2..f3bca954 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -91,6 +91,10 @@ spec: value: atlas - name: KEYCLOAK_ADMIN_CLIENT_ID value: bstein-dev-home-admin + - name: ARIADNE_URL + value: http://ariadne.maintenance.svc.cluster.local:8080 + - name: ARIADNE_TIMEOUT_SEC + value: "10" - name: ACCOUNT_ALLOWED_GROUPS value: "" - name: HTTP_CHECK_TIMEOUT_SEC From 093d86a4e99aad3990bf8fbcfcc415cbf3d75479 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:03:50 -0300 Subject: [PATCH 005/416] chore: add maintenance image automation --- .../bstein-dev-home/image-automation.yaml | 4 +-- .../flux-system/platform/kustomization.yaml | 1 + .../maintenance/image-automation.yaml | 26 +++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 clusters/atlas/flux-system/platform/maintenance/image-automation.yaml diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 88dda408..643d4792 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -13,14 +13,14 @@ spec: git: checkout: ref: - branch: feature/vault-consumption + branch: feature/ariadne commit: author: email: ops@bstein.dev name: flux-bot messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}" push: - branch: feature/vault-consumption + branch: feature/ariadne update: strategy: Setters path: services/bstein-dev-home diff --git a/clusters/atlas/flux-system/platform/kustomization.yaml b/clusters/atlas/flux-system/platform/kustomization.yaml index b689cc04..6e75b040 100644 --- a/clusters/atlas/flux-system/platform/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/kustomization.yaml @@ -11,6 +11,7 @@ resources: - monitoring/kustomization.yaml - logging/kustomization.yaml - maintenance/kustomization.yaml + - maintenance/image-automation.yaml - longhorn-adopt/kustomization.yaml - longhorn/kustomization.yaml - longhorn-ui/kustomization.yaml diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml new file mode 100644 index 00000000..867cae48 --- /dev/null +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -0,0 +1,26 @@ +# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImageUpdateAutomation +metadata: + name: maintenance + namespace: flux-system +spec: + interval: 1m0s + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + git: + checkout: + ref: + branch: feature/ariadne + commit: + author: + email: ops@bstein.dev + name: flux-bot + messageTemplate: "chore(maintenance): update images to {{range .Updated.Images}}{{.}}{{end}}" + push: + branch: feature/ariadne + update: + strategy: Setters + path: services/maintenance From 88ed989023c8ed31f7f6d3d33ef2e4cd64b98521 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:11:38 -0300 Subject: [PATCH 006/416] bstein-dev-home: bump images to 0.1.1-107 --- services/bstein-dev-home/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f9d3c87f..ec137dc6 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,9 +20,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-102 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} + newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-103 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} + newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From c377294d1b92df8c36794b57328e604b11ed7d87 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:33:34 -0300 Subject: [PATCH 007/416] fix: unblock keycloak and refresh glue checks --- ci/tests/glue/config.yaml | 9 +++++++++ ci/tests/glue/test_glue_metrics.py | 19 +++++++++++++++++++ services/jenkins/configmap-jcasc.yaml | 2 +- services/keycloak/realm-settings-job.yaml | 2 +- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/ci/tests/glue/config.yaml b/ci/tests/glue/config.yaml index 8adf4ca0..16b656c2 100644 --- a/ci/tests/glue/config.yaml +++ b/ci/tests/glue/config.yaml @@ -1,7 +1,16 @@ max_success_age_hours: 48 allow_suspended: + - bstein-dev-home/vaultwarden-cred-sync - comms/othrys-room-reset - comms/pin-othrys-invite - comms/seed-othrys-room - finance/firefly-user-sync + - health/wger-admin-ensure - health/wger-user-sync + - mailu-mailserver/mailu-sync-nightly + - nextcloud/nextcloud-mail-sync +ariadne_schedule_tasks: + - schedule.mailu_sync + - schedule.nextcloud_sync + - schedule.vaultwarden_sync + - schedule.wger_admin diff --git a/ci/tests/glue/test_glue_metrics.py b/ci/tests/glue/test_glue_metrics.py index 16b01c7c..52ec0bef 100644 --- a/ci/tests/glue/test_glue_metrics.py +++ b/ci/tests/glue/test_glue_metrics.py @@ -1,11 +1,19 @@ from __future__ import annotations import os +from pathlib import Path import requests +import yaml VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/") +CONFIG_PATH = Path(__file__).with_name("config.yaml") + + +def _load_config() -> dict: + with CONFIG_PATH.open("r", encoding="utf-8") as handle: + return yaml.safe_load(handle) or {} def _query(promql: str) -> list[dict]: @@ -27,3 +35,14 @@ def test_glue_metrics_success_join(): ) series = _query(query) assert series, "No glue cronjob last success series found" + + +def test_ariadne_schedule_metrics_present(): + cfg = _load_config() + expected = cfg.get("ariadne_schedule_tasks", []) + if not expected: + return + series = _query("ariadne_schedule_next_run_timestamp_seconds") + tasks = {item.get("metric", {}).get("task") for item in series} + missing = [task for task in expected if task not in tasks] + assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}" diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index ac26350e..25dd748d 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -151,7 +151,7 @@ data: url('https://scm.bstein.dev/bstein/titan-iac.git') credentials('gitea-pat') } - branches('*/feature/vault-consumption') + branches('*/main') } } scriptPath('ci/Jenkinsfile.titan-iac') diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index a0b36ec5..fdee377c 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-32 + name: keycloak-realm-settings-33 namespace: sso spec: backoffLimit: 0 From d25ca49c4949f61629827a3ef3e81de717ea3870 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:35:29 -0300 Subject: [PATCH 008/416] chore: run portal onboarding e2e job --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index f22272e0..201e3f57 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-19 + name: portal-onboarding-e2e-test-20 namespace: bstein-dev-home spec: backoffLimit: 0 From c508d7ade8e83b17b68a92654add1d1d5c96e41c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:38:22 -0300 Subject: [PATCH 009/416] fix: point portal at ariadne service --- services/bstein-dev-home/backend-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index f3bca954..074a19d0 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -92,7 +92,7 @@ spec: - name: KEYCLOAK_ADMIN_CLIENT_ID value: bstein-dev-home-admin - name: ARIADNE_URL - value: http://ariadne.maintenance.svc.cluster.local:8080 + value: http://ariadne.maintenance.svc.cluster.local - name: ARIADNE_TIMEOUT_SEC value: "10" - name: ACCOUNT_ALLOWED_GROUPS From ee5bfea07220491517e375a302d9ba3fd9e13dd7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:42:14 -0300 Subject: [PATCH 010/416] chore: rerun portal onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 201e3f57..0b650903 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-20 + name: portal-onboarding-e2e-test-21 namespace: bstein-dev-home spec: backoffLimit: 0 From 9af9f28060c4e758d08287169c311a2a1ea94f24 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 22:49:23 -0300 Subject: [PATCH 011/416] fix: extend mailu mailbox wait for ariadne --- services/maintenance/ariadne-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index ee4884da..0543f80f 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -93,7 +93,7 @@ spec: - name: MAILU_SYNC_URL value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC - value: "60" + value: "180" - name: MAILU_DB_HOST value: postgres-service.postgres.svc.cluster.local - name: MAILU_DB_PORT From 60973d3f3c1a9ea50894bf031e95c71320973197 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:04:59 -0300 Subject: [PATCH 012/416] chore(maintenance): bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 5e199a98..e09f6a84 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-1 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 179023a1cccb238f0ee342a80597f7348276cb7e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:05:46 -0300 Subject: [PATCH 013/416] chore(portal): rerun onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 0b650903..c9c1c044 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-21 + name: portal-onboarding-e2e-test-22 namespace: bstein-dev-home spec: backoffLimit: 0 From 564af1c1d436224ec4e9ee7840030c6678ab4014 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:28:07 -0300 Subject: [PATCH 014/416] fix(mailu): allow forced sync --- services/mailu/mailu-sync-listener.yaml | 2 +- services/mailu/scripts/mailu_sync_listener.py | 10 ++++++---- services/maintenance/kustomization.yaml | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/services/mailu/mailu-sync-listener.yaml b/services/mailu/mailu-sync-listener.yaml index cc98107f..b3d2acce 100644 --- a/services/mailu/mailu-sync-listener.yaml +++ b/services/mailu/mailu-sync-listener.yaml @@ -30,7 +30,7 @@ spec: app: mailu-sync-listener annotations: vault.hashicorp.com/agent-inject: "true" - atlas.bstein.dev/mailu-sync-rev: "2" + atlas.bstein.dev/mailu-sync-rev: "3" vault.hashicorp.com/role: "mailu-mailserver" vault.hashicorp.com/agent-inject-secret-mailu-db-secret__database: "kv/data/atlas/mailu/mailu-db-secret" vault.hashicorp.com/agent-inject-template-mailu-db-secret__database: | diff --git a/services/mailu/scripts/mailu_sync_listener.py b/services/mailu/scripts/mailu_sync_listener.py index 6ac0da7c..4e31c811 100644 --- a/services/mailu/scripts/mailu_sync_listener.py +++ b/services/mailu/scripts/mailu_sync_listener.py @@ -39,12 +39,12 @@ def _run_sync_blocking() -> int: sync_done.set() -def _trigger_sync_async() -> bool: +def _trigger_sync_async(force: bool = False) -> bool: with lock: now = time() if sync_running: return False - if now - last_run < MIN_INTERVAL_SECONDS: + if not force and now - last_run < MIN_INTERVAL_SECONDS: return False thread = threading.Thread(target=_run_sync_blocking, daemon=True) @@ -64,15 +64,17 @@ class Handler(http.server.BaseHTTPRequestHandler): return wait = False + force = False if isinstance(payload, dict): wait = bool(payload.get("wait")) + force = bool(payload.get("force")) if wait: with lock: already_running = sync_running if not already_running: - _trigger_sync_async() + _trigger_sync_async(force=force) sync_done.wait(timeout=WAIT_TIMEOUT_SECONDS) with lock: @@ -87,7 +89,7 @@ class Handler(http.server.BaseHTTPRequestHandler): self.end_headers() return - _trigger_sync_async() + _trigger_sync_async(force=force) self.send_response(202) self.end_headers() diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index e09f6a84..9255d889 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-1 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-2 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 49292f9d8ba3a473f284de34015f4e7fcbe7b2f2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:31:45 -0300 Subject: [PATCH 015/416] chore(portal): rerun onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index c9c1c044..9dbe68df 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-22 + name: portal-onboarding-e2e-test-23 namespace: bstein-dev-home spec: backoffLimit: 0 From d07415e6230bd1b6fe0be53de8215a5f17cf114d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:45:31 -0300 Subject: [PATCH 016/416] core: fix postmark DNS and time sync --- infrastructure/core/coredns-custom.yaml | 3 ++ infrastructure/core/kustomization.yaml | 1 + infrastructure/core/ntp-sync-daemonset.yaml | 50 +++++++++++++++++++ .../postmark-exporter-deployment.yaml | 4 +- 4 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 infrastructure/core/ntp-sync-daemonset.yaml diff --git a/infrastructure/core/coredns-custom.yaml b/infrastructure/core/coredns-custom.yaml index 8aeff149..6266a22a 100644 --- a/infrastructure/core/coredns-custom.yaml +++ b/infrastructure/core/coredns-custom.yaml @@ -32,6 +32,9 @@ data: 192.168.22.9 notes.bstein.dev 192.168.22.9 office.bstein.dev 192.168.22.9 pegasus.bstein.dev + 3.136.224.193 pm-bounces.bstein.dev + 3.150.68.49 pm-bounces.bstein.dev + 18.189.137.81 pm-bounces.bstein.dev 192.168.22.9 registry.bstein.dev 192.168.22.9 scm.bstein.dev 192.168.22.9 secret.bstein.dev diff --git a/infrastructure/core/kustomization.yaml b/infrastructure/core/kustomization.yaml index 6286186d..257e1f06 100644 --- a/infrastructure/core/kustomization.yaml +++ b/infrastructure/core/kustomization.yaml @@ -6,5 +6,6 @@ resources: - ../modules/profiles/atlas-ha - coredns-custom.yaml - coredns-deployment.yaml + - ntp-sync-daemonset.yaml - ../sources/cert-manager/letsencrypt.yaml - ../sources/cert-manager/letsencrypt-prod.yaml diff --git a/infrastructure/core/ntp-sync-daemonset.yaml b/infrastructure/core/ntp-sync-daemonset.yaml new file mode 100644 index 00000000..ba972949 --- /dev/null +++ b/infrastructure/core/ntp-sync-daemonset.yaml @@ -0,0 +1,50 @@ +# infrastructure/core/ntp-sync-daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: ntp-sync + namespace: kube-system + labels: + app: ntp-sync +spec: + selector: + matchLabels: + app: ntp-sync + template: + metadata: + labels: + app: ntp-sync + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist + - key: node-role.kubernetes.io/master + operator: DoesNotExist + containers: + - name: ntp-sync + image: public.ecr.aws/docker/library/busybox:1.36.1 + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c"] + args: + - | + set -eu + while true; do + ntpd -q -p pool.ntp.org || true + sleep 300 + done + securityContext: + capabilities: + add: ["SYS_TIME"] + runAsUser: 0 + runAsGroup: 0 + resources: + requests: + cpu: 10m + memory: 16Mi + limits: + cpu: 50m + memory: 64Mi diff --git a/services/monitoring/postmark-exporter-deployment.yaml b/services/monitoring/postmark-exporter-deployment.yaml index 64062248..98791d95 100644 --- a/services/monitoring/postmark-exporter-deployment.yaml +++ b/services/monitoring/postmark-exporter-deployment.yaml @@ -18,9 +18,9 @@ spec: prometheus.io/path: "/metrics" vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "monitoring" - vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/monitoring/postmark-exporter" + vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/shared/postmark-relay" vault.hashicorp.com/agent-inject-template-postmark-env: | - {{- with secret "kv/data/atlas/monitoring/postmark-exporter" -}} + {{- with secret "kv/data/atlas/shared/postmark-relay" -}} export POSTMARK_SERVER_TOKEN="{{ index .Data.data "apikey" }}" export POSTMARK_SERVER_TOKEN_FALLBACK="{{ index .Data.data "apikey" }}" {{- if index .Data.data "sending-limit" }} From e9597660f9cf66a37bc8718db321ff77524e6b32 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:45:48 -0300 Subject: [PATCH 017/416] chore(maintenance): bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 9255d889..35af46f3 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-2 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-3 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From d406a12b4ab0e0985244053b0b1108c1d2cc757e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:47:24 -0300 Subject: [PATCH 018/416] chore(portal): rerun onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 9dbe68df..535b1dc2 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-23 + name: portal-onboarding-e2e-test-24 namespace: bstein-dev-home spec: backoffLimit: 0 From 5eae50ca4ccac7ec4fe3c4f8dab7012f7d8f7329 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:51:55 -0300 Subject: [PATCH 019/416] fix(mailu): pin sync workloads to arm64 --- services/mailu/mailu-sync-cronjob.yaml | 3 +++ services/mailu/mailu-sync-listener.yaml | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/services/mailu/mailu-sync-cronjob.yaml b/services/mailu/mailu-sync-cronjob.yaml index 671439d5..bbe9909e 100644 --- a/services/mailu/mailu-sync-cronjob.yaml +++ b/services/mailu/mailu-sync-cronjob.yaml @@ -38,6 +38,9 @@ spec: {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}} spec: restartPolicy: OnFailure + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" serviceAccountName: mailu-vault-sync containers: - name: mailu-sync diff --git a/services/mailu/mailu-sync-listener.yaml b/services/mailu/mailu-sync-listener.yaml index b3d2acce..0644c5bb 100644 --- a/services/mailu/mailu-sync-listener.yaml +++ b/services/mailu/mailu-sync-listener.yaml @@ -30,7 +30,7 @@ spec: app: mailu-sync-listener annotations: vault.hashicorp.com/agent-inject: "true" - atlas.bstein.dev/mailu-sync-rev: "3" + atlas.bstein.dev/mailu-sync-rev: "4" vault.hashicorp.com/role: "mailu-mailserver" vault.hashicorp.com/agent-inject-secret-mailu-db-secret__database: "kv/data/atlas/mailu/mailu-db-secret" vault.hashicorp.com/agent-inject-template-mailu-db-secret__database: | @@ -52,6 +52,9 @@ spec: {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}} spec: restartPolicy: Always + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" serviceAccountName: mailu-vault-sync containers: - name: listener From ba6b97b92a8d39a019895a239fd8e011dc71cb0e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 19 Jan 2026 23:58:37 -0300 Subject: [PATCH 020/416] chore(portal): rerun onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 535b1dc2..505e1817 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-24 + name: portal-onboarding-e2e-test-25 namespace: bstein-dev-home spec: backoffLimit: 0 From 6b5a77b32e17a6c938684d45a09f0fa86eeab98a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 00:07:45 -0300 Subject: [PATCH 021/416] chore(maintenance): bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 35af46f3..80c61dfe 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-3 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-4 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 3995b28aa3cfd45a7cfea88843aff8af3e7b111b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 00:09:49 -0300 Subject: [PATCH 022/416] chore(portal): rerun onboarding e2e --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index 505e1817..a0b6569e 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-25 + name: portal-onboarding-e2e-test-26 namespace: bstein-dev-home spec: backoffLimit: 0 From cacb03b42f43bd764ae37cef50fc418b21910790 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 00:58:04 -0300 Subject: [PATCH 023/416] mailu: use postmark server token for relay --- services/mailu/helmrelease.yaml | 60 ++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 7342141a..9779aed8 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -335,8 +335,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -397,8 +403,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -459,8 +471,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -521,8 +539,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -583,8 +607,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -645,8 +675,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - export RELAYUSER="{{ index .Data.data "apikey" }}" - export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- $apikey := index .Data.data "apikey" -}} + {{- if $apikey }} + export RELAYUSER="{{ $apikey }}" + export RELAYPASSWORD="{{ $apikey }}" + {{- else }} + export RELAYUSER="{{ index .Data.data "accesskey" }}" + export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync From 6157ebd98b973d2ddb929f6aa4f78588027558c5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:04:04 -0300 Subject: [PATCH 024/416] mailu: prefer postmark smtp token for relay --- services/mailu/helmrelease.yaml | 78 ++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 9779aed8..4621a2d8 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -335,13 +335,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -403,13 +404,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -471,13 +473,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -539,13 +542,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -607,13 +611,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: @@ -675,13 +680,14 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} - {{- $apikey := index .Data.data "apikey" -}} - {{- if $apikey }} - export RELAYUSER="{{ $apikey }}" - export RELAYPASSWORD="{{ $apikey }}" + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" {{- else }} - export RELAYUSER="{{ index .Data.data "accesskey" }}" - export RELAYPASSWORD="{{ index .Data.data "secretkey" }}" + export RELAYUSER="{{ index .Data.data "apikey" }}" + export RELAYPASSWORD="{{ index .Data.data "apikey" }}" {{- end }} {{ end }} spec: From 8c77d1569de5f50f0730ff0247f0c420446a2f5e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:05:06 -0300 Subject: [PATCH 025/416] ci: pin quality gate agents to rpi5 --- ci/Jenkinsfile.titan-iac | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/Jenkinsfile.titan-iac b/ci/Jenkinsfile.titan-iac index 3b13eb08..359dc94f 100644 --- a/ci/Jenkinsfile.titan-iac +++ b/ci/Jenkinsfile.titan-iac @@ -6,6 +6,10 @@ pipeline { apiVersion: v1 kind: Pod spec: + nodeSelector: + hardware: rpi5 + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" containers: - name: python image: python:3.12-slim From 4285c378a8368059c9c2222db0823347a7cabf46 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:07:01 -0300 Subject: [PATCH 026/416] mailu: recreate postfix on upgrade --- services/mailu/helmrelease.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 4621a2d8..e84b3760 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -455,6 +455,8 @@ spec: metadata: name: mailu-postfix spec: + strategy: + type: Recreate template: metadata: annotations: From e30afabdf05143059bf9e32911fe45955dbb52b7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:08:51 -0300 Subject: [PATCH 027/416] jenkins: re-target quality gate and restart --- services/jenkins/configmap-jcasc.yaml | 2 +- services/jenkins/deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 25dd748d..0a25aa17 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -75,7 +75,7 @@ data: url('https://scm.bstein.dev/bstein/titan-iac.git') credentials('gitea-pat') } - branches('*/main') + branches('*/feature/ariadne') } } scriptPath('services/jellyfin/oidc/Jenkinsfile') diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index e846a8ef..0e99caba 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -38,7 +38,7 @@ spec: GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} {{- end -}} - bstein.dev/restarted-at: "2026-01-19T00:25:00Z" + bstein.dev/restarted-at: "2026-01-20T04:08:33Z" spec: serviceAccountName: jenkins nodeSelector: From 7d44006423c8956de297624a5cc86272dad17b91 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:14:30 -0300 Subject: [PATCH 028/416] jenkins: align quality gate branch --- services/jenkins/configmap-jcasc.yaml | 4 ++-- services/jenkins/deployment.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 0a25aa17..fcd01f90 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -75,7 +75,7 @@ data: url('https://scm.bstein.dev/bstein/titan-iac.git') credentials('gitea-pat') } - branches('*/feature/ariadne') + branches('*/main') } } scriptPath('services/jellyfin/oidc/Jenkinsfile') @@ -151,7 +151,7 @@ data: url('https://scm.bstein.dev/bstein/titan-iac.git') credentials('gitea-pat') } - branches('*/main') + branches('*/feature/ariadne') } } scriptPath('ci/Jenkinsfile.titan-iac') diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 0e99caba..dfbe5feb 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -38,7 +38,7 @@ spec: GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} {{- end -}} - bstein.dev/restarted-at: "2026-01-20T04:08:33Z" + bstein.dev/restarted-at: "2026-01-20T04:14:13Z" spec: serviceAccountName: jenkins nodeSelector: From 29d0a376da0bb56b24ca1aac352b080a7087fe77 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 01:20:16 -0300 Subject: [PATCH 029/416] portal: rerun onboarding e2e job --- services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml index a0b6569e..681e89d2 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-26 + name: portal-onboarding-e2e-test-27 namespace: bstein-dev-home spec: backoffLimit: 0 From 6911e99e32badc4029be21eb2626b3907c8f9beb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 02:30:44 -0300 Subject: [PATCH 030/416] mailu: rewrite double-bounce to base domain --- services/mailu/helmrelease.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index e84b3760..7eab19ac 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -219,6 +219,8 @@ spec: overrides: postfix.cf: | mynetworks = 127.0.0.0/8 [::1]/128 10.42.0.0/16 10.43.0.0/16 192.168.22.0/24 + recipient_canonical_maps = regexp:/overrides/recipient_canonical, ${podop}recipientmap + recipient_canonical_classes = envelope_recipient,header_recipient smtpd_delay_reject = yes smtpd_helo_required = yes smtpd_helo_restrictions = reject_invalid_helo_hostname, reject_non_fqdn_helo_hostname, reject_unknown_helo_hostname @@ -238,6 +240,8 @@ spec: smtpd_client_message_rate_limit = 100 smtpd_client_recipient_rate_limit = 200 smtpd_recipient_limit = 100 + recipient_canonical: | + /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev podAnnotations: bstein.dev/restarted-at: "2026-01-06T00:00:00Z" redis: From 980daa683bdba94a0242b7e80c5757e37f65c1db Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 02:32:43 -0300 Subject: [PATCH 031/416] mailu: restart postfix to load canonical map --- services/mailu/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 7eab19ac..599faf13 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -243,7 +243,7 @@ spec: recipient_canonical: | /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev podAnnotations: - bstein.dev/restarted-at: "2026-01-06T00:00:00Z" + bstein.dev/restarted-at: "2026-01-20T04:20:00Z" redis: enabled: true architecture: standalone From 2cbecde47831dd4b8aa2f48125dafb94d1cdb548 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 02:37:02 -0300 Subject: [PATCH 032/416] mailu: keep podop socketmap in canonical maps --- services/mailu/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 599faf13..9d8519bc 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -219,7 +219,7 @@ spec: overrides: postfix.cf: | mynetworks = 127.0.0.0/8 [::1]/128 10.42.0.0/16 10.43.0.0/16 192.168.22.0/24 - recipient_canonical_maps = regexp:/overrides/recipient_canonical, ${podop}recipientmap + recipient_canonical_maps = regexp:/overrides/recipient_canonical, socketmap:unix:/tmp/podop.socket:recipientmap recipient_canonical_classes = envelope_recipient,header_recipient smtpd_delay_reject = yes smtpd_helo_required = yes From 55c9993d084ff9fdb1a2bc78cd8d0ed7a6178a1d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 02:38:04 -0300 Subject: [PATCH 033/416] mailu: restart postfix after canonical map update --- services/mailu/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 9d8519bc..2a7e6f5f 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -243,7 +243,7 @@ spec: recipient_canonical: | /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev podAnnotations: - bstein.dev/restarted-at: "2026-01-20T04:20:00Z" + bstein.dev/restarted-at: "2026-01-20T04:35:00Z" redis: enabled: true architecture: standalone From f8c368b21f124f0b3e75921a86810367f78add84 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 03:01:59 -0300 Subject: [PATCH 034/416] maintenance: extend Ariadne schedules and RBAC --- services/comms/guest-name-job.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 34 ++++++++++++++++++++ services/maintenance/ariadne-rbac.yaml | 29 +++++++++++++++++ services/maintenance/kustomization.yaml | 1 + services/vault/k8s-auth-config-cronjob.yaml | 1 + services/vault/oidc-config-cronjob.yaml | 1 + 6 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 services/maintenance/ariadne-rbac.yaml diff --git a/services/comms/guest-name-job.yaml b/services/comms/guest-name-job.yaml index 21a8af5f..3eae2dd2 100644 --- a/services/comms/guest-name-job.yaml +++ b/services/comms/guest-name-job.yaml @@ -8,7 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/1 * * * *" - suspend: false + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 1 diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 0543f80f..cd0d38c7 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -78,6 +78,8 @@ spec: value: bstein-dev-home-admin - name: PORTAL_PUBLIC_BASE_URL value: https://bstein.dev + - name: ARIADNE_LOG_LEVEL + value: INFO - name: PORTAL_ADMIN_USERS value: bstein - name: PORTAL_ADMIN_GROUPS @@ -120,6 +122,26 @@ spec: value: firefly-user-sync - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC value: "90" + - name: VAULT_NAMESPACE + value: vault + - name: VAULT_K8S_AUTH_CRONJOB + value: vault-k8s-auth-config + - name: VAULT_OIDC_CRONJOB + value: vault-oidc-config + - name: VAULT_JOB_WAIT_TIMEOUT_SEC + value: "120" + - name: COMMS_NAMESPACE + value: comms + - name: COMMS_GUEST_NAME_CRONJOB + value: guest-name-randomizer + - name: COMMS_PIN_INVITE_CRONJOB + value: pin-othrys-invite + - name: COMMS_RESET_ROOM_CRONJOB + value: othrys-room-reset + - name: COMMS_SEED_ROOM_CRONJOB + value: seed-othrys-room + - name: COMMS_JOB_WAIT_TIMEOUT_SEC + value: "60" - name: VAULTWARDEN_NAMESPACE value: vaultwarden - name: VAULTWARDEN_POD_LABEL @@ -154,6 +176,18 @@ spec: value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN value: "15 3 * * *" + - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH + value: "*/15 * * * *" + - name: ARIADNE_SCHEDULE_VAULT_OIDC + value: "*/15 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME + value: "*/1 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE + value: "*/30 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM + value: "0 0 1 1 *" + - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM + value: "*/10 * * * *" - name: WELCOME_EMAIL_ENABLED value: "true" - name: K8S_API_TIMEOUT_SEC diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml new file mode 100644 index 00000000..8d2a2a9a --- /dev/null +++ b/services/maintenance/ariadne-rbac.yaml @@ -0,0 +1,29 @@ +# services/maintenance/ariadne-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: ariadne-job-spawner +rules: + - apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: + - get + - list + - watch + - create + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ariadne-job-spawner +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: ariadne-job-spawner diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 80c61dfe..0810f5e7 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -8,6 +8,7 @@ resources: - vault-serviceaccount.yaml - vault-sync-deployment.yaml - ariadne-serviceaccount.yaml + - ariadne-rbac.yaml - disable-k3s-traefik-serviceaccount.yaml - k3s-traefik-cleanup-rbac.yaml - node-nofile-serviceaccount.yaml diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml index 29e8e809..e7cca14e 100644 --- a/services/vault/k8s-auth-config-cronjob.yaml +++ b/services/vault/k8s-auth-config-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/vault/oidc-config-cronjob.yaml b/services/vault/oidc-config-cronjob.yaml index 013c9f32..4d317c55 100644 --- a/services/vault/oidc-config-cronjob.yaml +++ b/services/vault/oidc-config-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 From cf20c27cedd4fdf793faad7fdf83756a3c5f6540 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 03:21:36 -0300 Subject: [PATCH 035/416] ci(jenkins): add multibranch quality gate --- ci/Jenkinsfile.titan-iac | 23 +++++++++++++++-- services/jenkins/configmap-jcasc.yaml | 33 ++++++++++++++++--------- services/jenkins/configmap-plugins.yaml | 1 + services/maintenance/kustomization.yaml | 2 +- 4 files changed, 44 insertions(+), 15 deletions(-) diff --git a/ci/Jenkinsfile.titan-iac b/ci/Jenkinsfile.titan-iac index 359dc94f..77990d77 100644 --- a/ci/Jenkinsfile.titan-iac +++ b/ci/Jenkinsfile.titan-iac @@ -22,7 +22,6 @@ spec: environment { PIP_DISABLE_PIP_VERSION_CHECK = '1' PYTHONUNBUFFERED = '1' - DEPLOY_BRANCH = 'deploy' } stages { stage('Checkout') { @@ -40,7 +39,27 @@ spec: sh 'pytest -q ci/tests/glue' } } + stage('Resolve Flux branch') { + steps { + script { + env.FLUX_BRANCH = sh( + returnStdout: true, + script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml" + ).trim() + if (!env.FLUX_BRANCH) { + error('Flux branch not found in gotk-sync.yaml') + } + echo "Flux branch: ${env.FLUX_BRANCH}" + } + } + } stage('Promote') { + when { + expression { + def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '') + return env.FLUX_BRANCH && branch == env.FLUX_BRANCH + } + } steps { withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) { sh ''' @@ -48,7 +67,7 @@ spec: git config user.email "jenkins@bstein.dev" git config user.name "jenkins" git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git - git push origin HEAD:${DEPLOY_BRANCH} + git push origin HEAD:${FLUX_BRANCH} ''' } } diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index fcd01f90..62012f1c 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -139,24 +139,33 @@ data: } } } - pipelineJob('titan-iac-quality-gate') { - triggers { - scm('H/5 * * * *') - } - definition { - cpsScm { - scm { + multibranchPipelineJob('titan-iac-quality-gate') { + branchSources { + branchSource { + source { git { - remote { - url('https://scm.bstein.dev/bstein/titan-iac.git') - credentials('gitea-pat') - } - branches('*/feature/ariadne') + id('titan-iac-quality-gate') + remote('https://scm.bstein.dev/bstein/titan-iac.git') + credentialsId('gitea-pat') } } + } + } + factory { + workflowBranchProjectFactory { scriptPath('ci/Jenkinsfile.titan-iac') } } + orphanedItemStrategy { + discardOldItems { + numToKeep(30) + } + } + triggers { + periodicFolderTrigger { + interval('12h') + } + } } base.yaml: | jenkins: diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index eabea13b..108c6461 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -9,6 +9,7 @@ data: kubernetes workflow-aggregator git + git-branch-source pipeline-utility-steps configuration-as-code configuration-as-code-support diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 0810f5e7..b7fe46b5 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-4 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-5 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 652ab18e82989075af046fd7c4832ad10fc800b9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 03:30:48 -0300 Subject: [PATCH 036/416] ci(jenkins): add Ariadne pipeline job --- services/jenkins/configmap-jcasc.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 62012f1c..78d98fea 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -120,6 +120,25 @@ data: } } } + pipelineJob('ariadne') { + triggers { + scm('H/2 * * * *') + } + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/ariadne.git') + credentials('gitea-pat') + } + branches('*/master') + } + } + scriptPath('Jenkinsfile') + } + } + } pipelineJob('data-prepper') { triggers { scm('H/5 * * * *') From 5690376b72af6dd6c816c5ee57e792457680078c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 03:59:19 -0300 Subject: [PATCH 037/416] glue: preserve keycloak profile updates --- services/mailu/scripts/mailu_sync.py | 32 ++++++++++++++++++++++++- services/maintenance/kustomization.yaml | 2 +- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/services/mailu/scripts/mailu_sync.py b/services/mailu/scripts/mailu_sync.py index 001917ab..71b0f5a2 100644 --- a/services/mailu/scripts/mailu_sync.py +++ b/services/mailu/scripts/mailu_sync.py @@ -130,7 +130,9 @@ def kc_update_attributes(token, user, attributes): if not isinstance(current_attrs, dict): current_attrs = {} current_attrs.update(attributes) - resp = SESSION.put(user_url, headers=headers, json={"attributes": current_attrs}, timeout=20) + payload = _safe_update_payload(current_payload) + payload["attributes"] = current_attrs + resp = SESSION.put(user_url, headers=headers, json=payload, timeout=20) resp.raise_for_status() verify = SESSION.get( user_url, @@ -144,6 +146,34 @@ def kc_update_attributes(token, user, attributes): raise Exception(f"attribute not persisted for {user.get('email') or user['username']}") +def _safe_update_payload(user_payload: dict) -> dict: + payload: dict = {} + username = user_payload.get("username") + if isinstance(username, str): + payload["username"] = username + enabled = user_payload.get("enabled") + if isinstance(enabled, bool): + payload["enabled"] = enabled + email = user_payload.get("email") + if isinstance(email, str): + payload["email"] = email + email_verified = user_payload.get("emailVerified") + if isinstance(email_verified, bool): + payload["emailVerified"] = email_verified + first_name = user_payload.get("firstName") + if isinstance(first_name, str): + payload["firstName"] = first_name + last_name = user_payload.get("lastName") + if isinstance(last_name, str): + payload["lastName"] = last_name + actions = user_payload.get("requiredActions") + if isinstance(actions, list): + payload["requiredActions"] = [a for a in actions if isinstance(a, str)] + attrs = user_payload.get("attributes") + payload["attributes"] = attrs if isinstance(attrs, dict) else {} + return payload + + def random_password(): alphabet = string.ascii_letters + string.digits return "".join(secrets.choice(alphabet) for _ in range(24)) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index b7fe46b5..a86453e1 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-5 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-6 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From f0855b7a3f9f3abefae9dc18e0d1ef584202dcfd Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 09:06:39 -0300 Subject: [PATCH 038/416] gitea: allow jenkins webhook --- services/gitea/deployment.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/gitea/deployment.yaml b/services/gitea/deployment.yaml index 9dc0c878..da188c35 100644 --- a/services/gitea/deployment.yaml +++ b/services/gitea/deployment.yaml @@ -169,6 +169,8 @@ spec: value: "trace" - name: GITEA__service__REQUIRE_SIGNIN_VIEW value: "false" + - name: GITEA__webhook__ALLOWED_HOST_LIST + value: "ci.bstein.dev" - name: GITEA__server__PROXY_HEADERS value: "X-Forwarded-For, X-Forwarded-Proto, X-Forwarded-Host" - name: GITEA__session__COOKIE_SECURE From d21dc989f69729e869da9002ccb3113577c8423b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 09:37:21 -0300 Subject: [PATCH 039/416] jenkins: pin root url for OIDC --- services/jenkins/configmap-jcasc.yaml | 5 ++++- services/jenkins/deployment.yaml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 78d98fea..d4a29f1e 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -18,7 +18,7 @@ data: logoutFromOpenIdProvider: true postLogoutRedirectUrl: "https://ci.bstein.dev" sendScopesInTokenRequest: true - rootURLFromRequest: true + rootURLFromRequest: false userNameField: "preferred_username" fullNameFieldName: "name" emailFieldName: "email" @@ -245,3 +245,6 @@ data: crumbIssuer: standard: excludeClientIPFromCrumb: true + unclassified: + location: + url: "https://ci.bstein.dev/" diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index dfbe5feb..fdb8d107 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -38,7 +38,7 @@ spec: GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} {{- end -}} - bstein.dev/restarted-at: "2026-01-20T04:14:13Z" + bstein.dev/restarted-at: "2026-01-20T05:05:00Z" spec: serviceAccountName: jenkins nodeSelector: From eb23881f6496fd81ff3a5900b53ae2333a77ee9e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 09:45:33 -0300 Subject: [PATCH 040/416] jenkins: drop removed multibranch plugin --- services/jenkins/configmap-jcasc.yaml | 33 +++++++++---------------- services/jenkins/configmap-plugins.yaml | 2 -- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index d4a29f1e..9e116c0e 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -158,33 +158,24 @@ data: } } } - multibranchPipelineJob('titan-iac-quality-gate') { - branchSources { - branchSource { - source { + pipelineJob('titan-iac-quality-gate') { + triggers { + scm('H/12 * * * *') + } + definition { + cpsScm { + scm { git { - id('titan-iac-quality-gate') - remote('https://scm.bstein.dev/bstein/titan-iac.git') - credentialsId('gitea-pat') + remote { + url('https://scm.bstein.dev/bstein/titan-iac.git') + credentials('gitea-pat') + } + branches('*/main') } } - } - } - factory { - workflowBranchProjectFactory { scriptPath('ci/Jenkinsfile.titan-iac') } } - orphanedItemStrategy { - discardOldItems { - numToKeep(30) - } - } - triggers { - periodicFolderTrigger { - interval('12h') - } - } } base.yaml: | jenkins: diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index 108c6461..d20a2839 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -9,10 +9,8 @@ data: kubernetes workflow-aggregator git - git-branch-source pipeline-utility-steps configuration-as-code - configuration-as-code-support oic-auth job-dsl simple-theme-plugin From 2aa4bd1fe18efc9e9c3108ace5e434c403f8217d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:15:33 -0300 Subject: [PATCH 041/416] jenkins: restore multibranch + webhook token --- services/jenkins/configmap-jcasc.yaml | 40 +++++++++++++++++-------- services/jenkins/configmap-plugins.yaml | 22 +++++++++----- services/jenkins/deployment.yaml | 17 ++++++----- 3 files changed, 52 insertions(+), 27 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 9e116c0e..ca3a7228 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -158,24 +158,40 @@ data: } } } - pipelineJob('titan-iac-quality-gate') { - triggers { - scm('H/12 * * * *') - } - definition { - cpsScm { - scm { + multibranchPipelineJob('titan-iac-quality-gate') { + branchSources { + branchSource { + source { git { - remote { - url('https://scm.bstein.dev/bstein/titan-iac.git') - credentials('gitea-pat') - } - branches('*/main') + id('titan-iac-quality-gate') + remote('https://scm.bstein.dev/bstein/titan-iac.git') + credentialsId('gitea-pat') } } + } + } + factory { + workflowBranchProjectFactory { scriptPath('ci/Jenkinsfile.titan-iac') } } + orphanedItemStrategy { + discardOldItems { + numToKeep(30) + } + } + triggers { + periodicFolderTrigger { + interval('12h') + } + } + configure { node -> + def token = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: '' + def triggers = node / 'triggers' + triggers << 'com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger' { + token(token) + } + } } base.yaml: | jenkins: diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index d20a2839..35295126 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -6,11 +6,17 @@ metadata: namespace: jenkins data: plugins.txt: | - kubernetes - workflow-aggregator - git - pipeline-utility-steps - configuration-as-code - oic-auth - job-dsl - simple-theme-plugin + kubernetes:4416.v2ea_b_5372da_a_e + workflow-aggregator:608.v67378e9d3db_1 + git:5.8.1 + pipeline-utility-steps:2.20.0 + configuration-as-code:2031.veb_a_fdda_b_3ffd + oic-auth:4.626.ve5a_d9f26c051 + job-dsl:1.93 + simple-theme-plugin:230.v8b_fd91b_b_800c + workflow-multibranch:821.vc3b_4ea_780798 + branch-api:2.1268.v044a_87612da_8 + scm-api:724.v7d839074eb_5c + gitea:268.v75e47974c01d + gitea-checks:603.621.vc708da_fb_371d + multibranch-scan-webhook-trigger:1.0.11 diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index fdb8d107..c82a6af9 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -22,23 +22,26 @@ spec: vault.hashicorp.com/role: "jenkins" vault.hashicorp.com/agent-inject-secret-jenkins-env: "kv/data/atlas/jenkins/jenkins-oidc" vault.hashicorp.com/agent-inject-template-jenkins-env: | - {{- with secret "kv/data/atlas/jenkins/jenkins-oidc" -}} + {{ with secret "kv/data/atlas/jenkins/jenkins-oidc" }} OIDC_CLIENT_ID={{ .Data.data.clientId }} OIDC_CLIENT_SECRET={{ .Data.data.clientSecret }} OIDC_AUTH_URL={{ .Data.data.authorizationUrl }} OIDC_TOKEN_URL={{ .Data.data.tokenUrl }} OIDC_USERINFO_URL={{ .Data.data.userInfoUrl }} OIDC_LOGOUT_URL={{ .Data.data.logoutUrl }} - {{- end }} - {{- with secret "kv/data/atlas/jenkins/harbor-robot-creds" -}} + {{ end }} + {{ with secret "kv/data/atlas/jenkins/harbor-robot-creds" }} HARBOR_ROBOT_USERNAME={{ .Data.data.username }} HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} - {{- end }} - {{- with secret "kv/data/atlas/jenkins/gitea-pat" -}} + {{ end }} + {{ with secret "kv/data/atlas/jenkins/gitea-pat" }} GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} - {{- end -}} - bstein.dev/restarted-at: "2026-01-20T05:05:00Z" + {{ end }} + {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} + TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} + {{ end }} + bstein.dev/restarted-at: "2026-01-20T13:10:00Z" spec: serviceAccountName: jenkins nodeSelector: From 6d83204c9c5e212849634fd01c57f3b2ab8d6473 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:23:08 -0300 Subject: [PATCH 042/416] jenkins: pin oic-auth for core 2.528.3 --- services/jenkins/configmap-plugins.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index 35295126..1c43cfb2 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -11,7 +11,7 @@ data: git:5.8.1 pipeline-utility-steps:2.20.0 configuration-as-code:2031.veb_a_fdda_b_3ffd - oic-auth:4.626.ve5a_d9f26c051 + oic-auth:4.609.v9de140f63d01 job-dsl:1.93 simple-theme-plugin:230.v8b_fd91b_b_800c workflow-multibranch:821.vc3b_4ea_780798 From 7c9f7da361e3175dd59382c393d0b42ed5c799d7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:31:30 -0300 Subject: [PATCH 043/416] jenkins: fix webhook trigger DSL --- services/jenkins/configmap-jcasc.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index ca3a7228..7e6df319 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -186,11 +186,10 @@ data: } } configure { node -> - def token = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: '' + def webhookToken = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: '' def triggers = node / 'triggers' - triggers << 'com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger' { - token(token) - } + def webhook = triggers.appendNode('com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger') + webhook.appendNode('token', webhookToken) } } base.yaml: | From a7bc174db1844bd4eef8e7ebc8f74f570c0bef8d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:37:57 -0300 Subject: [PATCH 044/416] jenkins: clean legacy quality-gate job --- services/jenkins/deployment.yaml | 2 +- services/jenkins/kustomization.yaml | 1 + services/jenkins/scripts/job_cleanup.groovy | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 services/jenkins/scripts/job_cleanup.groovy diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index c82a6af9..c71812ae 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T13:10:00Z" + bstein.dev/restarted-at: "2026-01-20T13:45:00Z" spec: serviceAccountName: jenkins nodeSelector: diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index acb6fb43..987e842b 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -16,6 +16,7 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: + - job_cleanup.groovy=scripts/job_cleanup.groovy - theme.groovy=scripts/theme.groovy options: disableNameSuffixHash: true diff --git a/services/jenkins/scripts/job_cleanup.groovy b/services/jenkins/scripts/job_cleanup.groovy new file mode 100644 index 00000000..f123c6b7 --- /dev/null +++ b/services/jenkins/scripts/job_cleanup.groovy @@ -0,0 +1,13 @@ +import jenkins.branch.MultiBranchProject +import jenkins.model.Jenkins + +def jenkins = Jenkins.instance +if (jenkins == null) { + return +} + +def legacy = jenkins.getItemByFullName('titan-iac-quality-gate') +if (legacy != null && !(legacy instanceof MultiBranchProject)) { + legacy.delete() + println("Deleted legacy job titan-iac-quality-gate (non-multibranch)") +} From ee5a4aedac58cbf44a37dbdac2c77c9dec4005b5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 10:59:51 -0300 Subject: [PATCH 045/416] jenkins: drop legacy cleanup and update triggers --- services/jenkins/configmap-jcasc.yaml | 40 +++++++++++++++------ services/jenkins/deployment.yaml | 2 +- services/jenkins/kustomization.yaml | 1 - services/jenkins/scripts/job_cleanup.groovy | 13 ------- 4 files changed, 31 insertions(+), 25 deletions(-) delete mode 100644 services/jenkins/scripts/job_cleanup.groovy diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 7e6df319..ba0ac810 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -49,8 +49,12 @@ data: jobs: - script: | pipelineJob('harbor-arm-build') { - triggers { - scm('H/5 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/5 * * * *') + } + } } definition { cpsScm { @@ -83,8 +87,12 @@ data: } } pipelineJob('ci-demo') { - triggers { - scm('H/1 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/1 * * * *') + } + } } definition { cpsScm { @@ -102,8 +110,12 @@ data: } } pipelineJob('bstein-dev-home') { - triggers { - scm('H/2 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/2 * * * *') + } + } } definition { cpsScm { @@ -121,8 +133,12 @@ data: } } pipelineJob('ariadne') { - triggers { - scm('H/2 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/2 * * * *') + } + } } definition { cpsScm { @@ -140,8 +156,12 @@ data: } } pipelineJob('data-prepper') { - triggers { - scm('H/5 * * * *') + properties { + pipelineTriggers { + triggers { + scm('H/5 * * * *') + } + } } definition { cpsScm { diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index c71812ae..9e83686e 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T13:45:00Z" + bstein.dev/restarted-at: "2026-01-20T14:05:00Z" spec: serviceAccountName: jenkins nodeSelector: diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index 987e842b..acb6fb43 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -16,7 +16,6 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: - - job_cleanup.groovy=scripts/job_cleanup.groovy - theme.groovy=scripts/theme.groovy options: disableNameSuffixHash: true diff --git a/services/jenkins/scripts/job_cleanup.groovy b/services/jenkins/scripts/job_cleanup.groovy deleted file mode 100644 index f123c6b7..00000000 --- a/services/jenkins/scripts/job_cleanup.groovy +++ /dev/null @@ -1,13 +0,0 @@ -import jenkins.branch.MultiBranchProject -import jenkins.model.Jenkins - -def jenkins = Jenkins.instance -if (jenkins == null) { - return -} - -def legacy = jenkins.getItemByFullName('titan-iac-quality-gate') -if (legacy != null && !(legacy instanceof MultiBranchProject)) { - legacy.delete() - println("Deleted legacy job titan-iac-quality-gate (non-multibranch)") -} From 33c329a494a4d356d01b99c5ea48f26d5ab1820c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 11:07:54 -0300 Subject: [PATCH 046/416] jenkins: use pollSCM for pipeline triggers --- services/jenkins/configmap-jcasc.yaml | 10 +++++----- services/jenkins/deployment.yaml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index ba0ac810..71826ff0 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -52,7 +52,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/5 * * * *') + pollSCM('H/5 * * * *') } } } @@ -90,7 +90,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/1 * * * *') + pollSCM('H/1 * * * *') } } } @@ -113,7 +113,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/2 * * * *') + pollSCM('H/2 * * * *') } } } @@ -136,7 +136,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/2 * * * *') + pollSCM('H/2 * * * *') } } } @@ -159,7 +159,7 @@ data: properties { pipelineTriggers { triggers { - scm('H/5 * * * *') + pollSCM('H/5 * * * *') } } } diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 9e83686e..cab36211 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T14:05:00Z" + bstein.dev/restarted-at: "2026-01-20T14:15:00Z" spec: serviceAccountName: jenkins nodeSelector: From dfe2faef5c09fa5ae86e8195d28646b507cad0a9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 11:14:29 -0300 Subject: [PATCH 047/416] jenkins: use scmTrigger for pipeline polls --- services/jenkins/configmap-jcasc.yaml | 25 ++++++++++++++++++++----- services/jenkins/deployment.yaml | 2 +- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 71826ff0..aa279e91 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -52,7 +52,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/5 * * * *') + scmTrigger { + spec('H/5 * * * *') + ignorePostCommitHooks(false) + } } } } @@ -90,7 +93,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/1 * * * *') + scmTrigger { + spec('H/1 * * * *') + ignorePostCommitHooks(false) + } } } } @@ -113,7 +119,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/2 * * * *') + scmTrigger { + spec('H/2 * * * *') + ignorePostCommitHooks(false) + } } } } @@ -136,7 +145,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/2 * * * *') + scmTrigger { + spec('H/2 * * * *') + ignorePostCommitHooks(false) + } } } } @@ -159,7 +171,10 @@ data: properties { pipelineTriggers { triggers { - pollSCM('H/5 * * * *') + scmTrigger { + spec('H/5 * * * *') + ignorePostCommitHooks(false) + } } } } diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index cab36211..7706807b 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T14:15:00Z" + bstein.dev/restarted-at: "2026-01-20T14:25:00Z" spec: serviceAccountName: jenkins nodeSelector: From a0ff159cabaa204fec41ff09d2305e9b43740a9c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 11:23:06 -0300 Subject: [PATCH 048/416] jenkins: fix scmTrigger spec field --- services/jenkins/configmap-jcasc.yaml | 10 +++++----- services/jenkins/deployment.yaml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index aa279e91..e29c1436 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -53,7 +53,7 @@ data: pipelineTriggers { triggers { scmTrigger { - spec('H/5 * * * *') + scmpoll_spec('H/5 * * * *') ignorePostCommitHooks(false) } } @@ -94,7 +94,7 @@ data: pipelineTriggers { triggers { scmTrigger { - spec('H/1 * * * *') + scmpoll_spec('H/1 * * * *') ignorePostCommitHooks(false) } } @@ -120,7 +120,7 @@ data: pipelineTriggers { triggers { scmTrigger { - spec('H/2 * * * *') + scmpoll_spec('H/2 * * * *') ignorePostCommitHooks(false) } } @@ -146,7 +146,7 @@ data: pipelineTriggers { triggers { scmTrigger { - spec('H/2 * * * *') + scmpoll_spec('H/2 * * * *') ignorePostCommitHooks(false) } } @@ -172,7 +172,7 @@ data: pipelineTriggers { triggers { scmTrigger { - spec('H/5 * * * *') + scmpoll_spec('H/5 * * * *') ignorePostCommitHooks(false) } } diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 7706807b..44925798 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -41,7 +41,7 @@ spec: {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T14:25:00Z" + bstein.dev/restarted-at: "2026-01-20T14:35:00Z" spec: serviceAccountName: jenkins nodeSelector: From e81eb57af3077232443af0cba16a30854f87512c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 11:54:15 -0300 Subject: [PATCH 049/416] jenkins: automate notifyCommit token --- services/jenkins/deployment.yaml | 3 +- services/jenkins/kustomization.yaml | 1 + .../jenkins/scripts/git-notify-token.groovy | 41 +++++++++++++++++++ services/jenkins/scripts/theme.groovy | 1 - 4 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 services/jenkins/scripts/git-notify-token.groovy diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 44925798..b5b3de63 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -40,8 +40,9 @@ spec: {{ end }} {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} + GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }} {{ end }} - bstein.dev/restarted-at: "2026-01-20T14:35:00Z" + bstein.dev/restarted-at: "2026-01-20T14:52:41Z" spec: serviceAccountName: jenkins nodeSelector: diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index acb6fb43..0a03f5b5 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -16,6 +16,7 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: + - git-notify-token.groovy=scripts/git-notify-token.groovy - theme.groovy=scripts/theme.groovy options: disableNameSuffixHash: true diff --git a/services/jenkins/scripts/git-notify-token.groovy b/services/jenkins/scripts/git-notify-token.groovy new file mode 100644 index 00000000..336c918a --- /dev/null +++ b/services/jenkins/scripts/git-notify-token.groovy @@ -0,0 +1,41 @@ +import hudson.plugins.git.ApiTokenPropertyConfiguration +import hudson.Util +import java.nio.charset.StandardCharsets +import java.security.MessageDigest + + +def entries = [ + [env: 'GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME', name: 'gitea-bstein-dev-home'], +] + +entries.each { entry -> + def token = System.getenv(entry.env) + if (!token || token.trim().isEmpty()) { + println("Git notifyCommit token ${entry.env} missing; skipping") + return + } + + try { + def config = ApiTokenPropertyConfiguration.get() + if (config.hasMatchingApiToken(token)) { + println("Git notifyCommit token ${entry.name} already configured") + return + } + + def digest = MessageDigest.getInstance("SHA-256") + def hash = Util.toHexString(digest.digest(token.getBytes(StandardCharsets.US_ASCII))) + + def field = ApiTokenPropertyConfiguration.class.getDeclaredField("apiTokens") + field.setAccessible(true) + def tokens = field.get(config) + + def ctor = ApiTokenPropertyConfiguration.HashedApiToken.class.getDeclaredConstructor(String.class, String.class) + ctor.setAccessible(true) + tokens.add(ctor.newInstance(entry.name, hash)) + config.save() + + println("Added git notifyCommit access token ${entry.name}") + } catch (Throwable e) { + println("Failed to configure git notifyCommit token ${entry.name}: ${e.class.simpleName}: ${e.message}") + } +} diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index cf171f74..5950bf44 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -8,7 +8,6 @@ if (decorators?.size() > 0) { def theme = decorators[0] theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css") theme.setJsUrl("") - theme.setTheme("") instance.save() println("Applied simple-theme-plugin dark theme") } else { From f3b8b93287682b437b54b11646f536d2c9bf9b52 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:04:24 -0300 Subject: [PATCH 050/416] jenkins: move agent workspace off node disk --- services/jenkins/cache-pvc.yaml | 13 +++++++++++++ services/jenkins/configmap-jcasc.yaml | 5 +++++ services/jenkins/deployment.yaml | 6 ++++-- services/jenkins/kustomization.yaml | 2 ++ services/jenkins/plugins-pvc.yaml | 13 +++++++++++++ 5 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 services/jenkins/cache-pvc.yaml create mode 100644 services/jenkins/plugins-pvc.yaml diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml new file mode 100644 index 00000000..784c7d8d --- /dev/null +++ b/services/jenkins/cache-pvc.yaml @@ -0,0 +1,13 @@ +# services/jenkins/cache-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jenkins-cache + namespace: jenkins +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + storageClassName: astreae diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index e29c1436..f485de81 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -258,6 +258,11 @@ data: templates: - name: "default" namespace: "jenkins" + workspaceVolume: + dynamicPVC: + accessModes: "ReadWriteOnce" + requestsSize: "5Gi" + storageClassName: "astreae" containers: - name: "jnlp" args: "^${computer.jnlpmac} ^${computer.name}" diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index b5b3de63..7ee1aad4 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -161,9 +161,11 @@ spec: persistentVolumeClaim: claimName: jenkins - name: jenkins-cache - emptyDir: {} + persistentVolumeClaim: + claimName: jenkins-cache - name: plugin-dir - emptyDir: {} + persistentVolumeClaim: + claimName: jenkins-plugins - name: plugins configMap: name: jenkins-plugins diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index 0a03f5b5..aab859ab 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -6,6 +6,8 @@ resources: - namespace.yaml - serviceaccount.yaml - pvc.yaml + - cache-pvc.yaml + - plugins-pvc.yaml - configmap-jcasc.yaml - configmap-plugins.yaml - deployment.yaml diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml new file mode 100644 index 00000000..45a967bb --- /dev/null +++ b/services/jenkins/plugins-pvc.yaml @@ -0,0 +1,13 @@ +# services/jenkins/plugins-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jenkins-plugins + namespace: jenkins +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + storageClassName: astreae From 5e2370aeaac5f21ebdf4f4c90226ad579b225e67 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:09:23 -0300 Subject: [PATCH 051/416] jenkins: expand pvc sizes and move /tmp to memory --- services/jenkins/cache-pvc.yaml | 2 +- services/jenkins/configmap-jcasc.yaml | 2 +- services/jenkins/deployment.yaml | 3 ++- services/jenkins/plugins-pvc.yaml | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml index 784c7d8d..75383059 100644 --- a/services/jenkins/cache-pvc.yaml +++ b/services/jenkins/cache-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 5Gi + storage: 50Gi storageClassName: astreae diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index f485de81..5ee6a3e7 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -261,7 +261,7 @@ data: workspaceVolume: dynamicPVC: accessModes: "ReadWriteOnce" - requestsSize: "5Gi" + requestsSize: "50Gi" storageClassName: "astreae" containers: - name: "jnlp" diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 7ee1aad4..5f50084e 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -176,4 +176,5 @@ spec: configMap: name: jenkins-init-scripts - name: tmp - emptyDir: {} + emptyDir: + medium: Memory diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml index 45a967bb..2812c7a6 100644 --- a/services/jenkins/plugins-pvc.yaml +++ b/services/jenkins/plugins-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 2Gi + storage: 20Gi storageClassName: astreae From 0850fd86ee821f2769fd9580687923842f358fdb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:19:58 -0300 Subject: [PATCH 052/416] jenkins: right-size pvc requests --- services/jenkins/cache-pvc.yaml | 2 +- services/jenkins/configmap-jcasc.yaml | 2 +- services/jenkins/plugins-pvc.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml index 75383059..79e8decb 100644 --- a/services/jenkins/cache-pvc.yaml +++ b/services/jenkins/cache-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 50Gi + storage: 20Gi storageClassName: astreae diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index 5ee6a3e7..c2144fa0 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -261,7 +261,7 @@ data: workspaceVolume: dynamicPVC: accessModes: "ReadWriteOnce" - requestsSize: "50Gi" + requestsSize: "20Gi" storageClassName: "astreae" containers: - name: "jnlp" diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml index 2812c7a6..e26d07fd 100644 --- a/services/jenkins/plugins-pvc.yaml +++ b/services/jenkins/plugins-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 20Gi + storage: 10Gi storageClassName: astreae From fc240e34fe67a8c0ad90a8cd057e7f7309ffdbc9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:21:42 -0300 Subject: [PATCH 053/416] jenkins: keep cache/plugin pvc sizes to avoid shrink --- services/jenkins/cache-pvc.yaml | 2 +- services/jenkins/plugins-pvc.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml index 79e8decb..75383059 100644 --- a/services/jenkins/cache-pvc.yaml +++ b/services/jenkins/cache-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 20Gi + storage: 50Gi storageClassName: astreae diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml index e26d07fd..2812c7a6 100644 --- a/services/jenkins/plugins-pvc.yaml +++ b/services/jenkins/plugins-pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 10Gi + storage: 20Gi storageClassName: astreae From 3cd38a6c701e9897a0d6e602bced499ec9905199 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:32:27 -0300 Subject: [PATCH 054/416] jenkins: rotate cache/plugin pvcs --- services/jenkins/cache-pvc.yaml | 4 ++-- services/jenkins/deployment.yaml | 4 ++-- services/jenkins/plugins-pvc.yaml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml index 75383059..a9ed319f 100644 --- a/services/jenkins/cache-pvc.yaml +++ b/services/jenkins/cache-pvc.yaml @@ -2,12 +2,12 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: jenkins-cache + name: jenkins-cache-v2 namespace: jenkins spec: accessModes: - ReadWriteOnce resources: requests: - storage: 50Gi + storage: 20Gi storageClassName: astreae diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 5f50084e..9f8fe99f 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -162,10 +162,10 @@ spec: claimName: jenkins - name: jenkins-cache persistentVolumeClaim: - claimName: jenkins-cache + claimName: jenkins-cache-v2 - name: plugin-dir persistentVolumeClaim: - claimName: jenkins-plugins + claimName: jenkins-plugins-v2 - name: plugins configMap: name: jenkins-plugins diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml index 2812c7a6..06715eb4 100644 --- a/services/jenkins/plugins-pvc.yaml +++ b/services/jenkins/plugins-pvc.yaml @@ -2,12 +2,12 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: jenkins-plugins + name: jenkins-plugins-v2 namespace: jenkins spec: accessModes: - ReadWriteOnce resources: requests: - storage: 20Gi + storage: 10Gi storageClassName: astreae From 50dcded32f09cdff6a99cb924b99c0e0faccf059 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:43:23 -0300 Subject: [PATCH 055/416] jenkins: add local dark theme css --- services/jenkins/deployment.yaml | 4 + services/jenkins/kustomization.yaml | 1 + services/jenkins/scripts/jenkins-theme.css | 97 ++++++++++++++++++++++ services/jenkins/scripts/theme.groovy | 2 +- 4 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 services/jenkins/scripts/jenkins-theme.css diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 9f8fe99f..b69f134a 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -94,6 +94,7 @@ spec: - -c - | set -e + mkdir -p /var/jenkins_home/userContent exec env $(cat /vault/secrets/jenkins-env) /usr/bin/tini -- /usr/local/bin/jenkins.sh ports: - name: http @@ -152,6 +153,9 @@ spec: mountPath: /config/jcasc - name: init-scripts mountPath: /usr/share/jenkins/ref/init.groovy.d + - name: init-scripts + mountPath: /var/jenkins_home/userContent/jenkins-theme.css + subPath: jenkins-theme.css - name: plugin-dir mountPath: /usr/share/jenkins/ref/plugins - name: tmp diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index aab859ab..444dd6de 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -18,6 +18,7 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: + - jenkins-theme.css=scripts/jenkins-theme.css - git-notify-token.groovy=scripts/git-notify-token.groovy - theme.groovy=scripts/theme.groovy options: diff --git a/services/jenkins/scripts/jenkins-theme.css b/services/jenkins/scripts/jenkins-theme.css new file mode 100644 index 00000000..56fe193f --- /dev/null +++ b/services/jenkins/scripts/jenkins-theme.css @@ -0,0 +1,97 @@ +@import url("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css"); + +:root { + --atlas-bg: #0f1216; + --atlas-surface: #171b21; + --atlas-surface-alt: #1f252d; + --atlas-border: #2b313b; + --atlas-text: #e6e9ef; + --atlas-text-muted: #b3bac6; + --atlas-link: #8fb7ff; +} + +body, +#page-body, +#page-header, +#header, +#main-panel, +#main-panel-content, +#side-panel, +.top-sticker-inner, +.bottom-sticker-inner, +#breadcrumbBar, +#breadcrumbs { + background-color: var(--atlas-bg) !important; + color: var(--atlas-text) !important; +} + +#side-panel .task-link, +#breadcrumbs a, +#breadcrumbs, +#projectstatus th a, +#projectstatus td, +#projectstatus th { + color: var(--atlas-text-muted) !important; +} + +a, +a:visited, +a:link { + color: var(--atlas-link) !important; +} + +a:hover { + opacity: 0.85; +} + +#main-panel, +#main-panel-content, +#description, +.pane, +table.pane { + background-color: var(--atlas-surface) !important; + color: var(--atlas-text) !important; +} + +table.pane tr:nth-child(odd) td { + background-color: var(--atlas-surface) !important; +} + +table.pane tr:nth-child(even) td, +#projectstatus tr:hover td { + background-color: var(--atlas-surface-alt) !important; +} + +input, +select, +textarea, +#search-box { + background-color: var(--atlas-surface-alt) !important; + color: var(--atlas-text) !important; + border-color: var(--atlas-border) !important; +} + +#header, +#page-header { + background-color: #202734 !important; +} + +#header .login, +#page-header .login { + color: var(--atlas-text) !important; +} + +#side-panel .task-link, +#side-panel .task-link:visited, +#side-panel .task-link:hover { + color: var(--atlas-text) !important; +} + +#footer { + background-color: var(--atlas-bg) !important; + color: var(--atlas-text-muted) !important; +} + +.jenkins_ver:after { + content: "atlas dark"; +} diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index 5950bf44..fd12474e 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -6,7 +6,7 @@ def decorators = instance.getExtensionList(SimpleThemeDecorator.class) if (decorators?.size() > 0) { def theme = decorators[0] - theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css") + theme.setCssUrl("https://ci.bstein.dev/userContent/jenkins-theme.css") theme.setJsUrl("") instance.save() println("Applied simple-theme-plugin dark theme") From fced9f5919a0fe477b24cbefce53d43aeac5fd94 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 17:54:47 -0300 Subject: [PATCH 056/416] jenkins: mount init scripts into home --- services/jenkins/deployment.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index b69f134a..7dff5afd 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -153,6 +153,8 @@ spec: mountPath: /config/jcasc - name: init-scripts mountPath: /usr/share/jenkins/ref/init.groovy.d + - name: init-scripts + mountPath: /var/jenkins_home/init.groovy.d - name: init-scripts mountPath: /var/jenkins_home/userContent/jenkins-theme.css subPath: jenkins-theme.css From a01dbbd7df85233feddbec7d63d84939f65cf9d0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 18:00:36 -0300 Subject: [PATCH 057/416] jenkins: inline dark theme css --- services/jenkins/deployment.yaml | 4 - services/jenkins/kustomization.yaml | 1 - services/jenkins/scripts/jenkins-theme.css | 97 --------------------- services/jenkins/scripts/theme.groovy | 99 +++++++++++++++++++++- 4 files changed, 98 insertions(+), 103 deletions(-) delete mode 100644 services/jenkins/scripts/jenkins-theme.css diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 7dff5afd..0b62ee09 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -94,7 +94,6 @@ spec: - -c - | set -e - mkdir -p /var/jenkins_home/userContent exec env $(cat /vault/secrets/jenkins-env) /usr/bin/tini -- /usr/local/bin/jenkins.sh ports: - name: http @@ -155,9 +154,6 @@ spec: mountPath: /usr/share/jenkins/ref/init.groovy.d - name: init-scripts mountPath: /var/jenkins_home/init.groovy.d - - name: init-scripts - mountPath: /var/jenkins_home/userContent/jenkins-theme.css - subPath: jenkins-theme.css - name: plugin-dir mountPath: /usr/share/jenkins/ref/plugins - name: tmp diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index 444dd6de..aab859ab 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -18,7 +18,6 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: - - jenkins-theme.css=scripts/jenkins-theme.css - git-notify-token.groovy=scripts/git-notify-token.groovy - theme.groovy=scripts/theme.groovy options: diff --git a/services/jenkins/scripts/jenkins-theme.css b/services/jenkins/scripts/jenkins-theme.css deleted file mode 100644 index 56fe193f..00000000 --- a/services/jenkins/scripts/jenkins-theme.css +++ /dev/null @@ -1,97 +0,0 @@ -@import url("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css"); - -:root { - --atlas-bg: #0f1216; - --atlas-surface: #171b21; - --atlas-surface-alt: #1f252d; - --atlas-border: #2b313b; - --atlas-text: #e6e9ef; - --atlas-text-muted: #b3bac6; - --atlas-link: #8fb7ff; -} - -body, -#page-body, -#page-header, -#header, -#main-panel, -#main-panel-content, -#side-panel, -.top-sticker-inner, -.bottom-sticker-inner, -#breadcrumbBar, -#breadcrumbs { - background-color: var(--atlas-bg) !important; - color: var(--atlas-text) !important; -} - -#side-panel .task-link, -#breadcrumbs a, -#breadcrumbs, -#projectstatus th a, -#projectstatus td, -#projectstatus th { - color: var(--atlas-text-muted) !important; -} - -a, -a:visited, -a:link { - color: var(--atlas-link) !important; -} - -a:hover { - opacity: 0.85; -} - -#main-panel, -#main-panel-content, -#description, -.pane, -table.pane { - background-color: var(--atlas-surface) !important; - color: var(--atlas-text) !important; -} - -table.pane tr:nth-child(odd) td { - background-color: var(--atlas-surface) !important; -} - -table.pane tr:nth-child(even) td, -#projectstatus tr:hover td { - background-color: var(--atlas-surface-alt) !important; -} - -input, -select, -textarea, -#search-box { - background-color: var(--atlas-surface-alt) !important; - color: var(--atlas-text) !important; - border-color: var(--atlas-border) !important; -} - -#header, -#page-header { - background-color: #202734 !important; -} - -#header .login, -#page-header .login { - color: var(--atlas-text) !important; -} - -#side-panel .task-link, -#side-panel .task-link:visited, -#side-panel .task-link:hover { - color: var(--atlas-text) !important; -} - -#footer { - background-color: var(--atlas-bg) !important; - color: var(--atlas-text-muted) !important; -} - -.jenkins_ver:after { - content: "atlas dark"; -} diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index fd12474e..b20169cb 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -6,7 +6,104 @@ def decorators = instance.getExtensionList(SimpleThemeDecorator.class) if (decorators?.size() > 0) { def theme = decorators[0] - theme.setCssUrl("https://ci.bstein.dev/userContent/jenkins-theme.css") + theme.setCssUrl("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css") + theme.setCssRules(""" +:root { + --atlas-bg: #0f1216; + --atlas-surface: #171b21; + --atlas-surface-alt: #1f252d; + --atlas-border: #2b313b; + --atlas-text: #e6e9ef; + --atlas-text-muted: #b3bac6; + --atlas-link: #8fb7ff; +} + +body, +#page-body, +#page-header, +#header, +#main-panel, +#main-panel-content, +#side-panel, +.top-sticker-inner, +.bottom-sticker-inner, +#breadcrumbBar, +#breadcrumbs { + background-color: var(--atlas-bg) !important; + color: var(--atlas-text) !important; +} + +#side-panel .task-link, +#breadcrumbs a, +#breadcrumbs, +#projectstatus th a, +#projectstatus td, +#projectstatus th { + color: var(--atlas-text-muted) !important; +} + +a, +a:visited, +a:link { + color: var(--atlas-link) !important; +} + +a:hover { + opacity: 0.85; +} + +#main-panel, +#main-panel-content, +#description, +.pane, +table.pane { + background-color: var(--atlas-surface) !important; + color: var(--atlas-text) !important; +} + +table.pane tr:nth-child(odd) td { + background-color: var(--atlas-surface) !important; +} + +table.pane tr:nth-child(even) td, +#projectstatus tr:hover td { + background-color: var(--atlas-surface-alt) !important; +} + +input, +select, +textarea, +#search-box { + background-color: var(--atlas-surface-alt) !important; + color: var(--atlas-text) !important; + border-color: var(--atlas-border) !important; +} + +#header, +#page-header { + background-color: #202734 !important; +} + +#header .login, +#page-header .login { + color: var(--atlas-text) !important; +} + +#side-panel .task-link, +#side-panel .task-link:visited, +#side-panel .task-link:hover { + color: var(--atlas-text) !important; +} + +#footer { + background-color: var(--atlas-bg) !important; + color: var(--atlas-text-muted) !important; +} + +.jenkins_ver:after { + content: "atlas dark"; +} +""".stripIndent().trim()) theme.setJsUrl("") instance.save() println("Applied simple-theme-plugin dark theme") From 3a6f1785cc410466eb629b2a2261f87b417da883 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 18:11:13 -0300 Subject: [PATCH 058/416] ci: add root Jenkinsfile and update keycloak ldap job --- Jenkinsfile | 77 ++++++++++++++++++++++ services/keycloak/ldap-federation-job.yaml | 50 +++++++++++++- 2 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 00000000..4d6b23e6 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,77 @@ +// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery. +pipeline { + agent { + kubernetes { + defaultContainer 'python' + yaml """ +apiVersion: v1 +kind: Pod +spec: + nodeSelector: + hardware: rpi5 + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: python + image: python:3.12-slim + command: + - cat + tty: true +""" + } + } + environment { + PIP_DISABLE_PIP_VERSION_CHECK = '1' + PYTHONUNBUFFERED = '1' + } + stages { + stage('Checkout') { + steps { + checkout scm + } + } + stage('Install deps') { + steps { + sh 'pip install --no-cache-dir -r ci/requirements.txt' + } + } + stage('Glue tests') { + steps { + sh 'pytest -q ci/tests/glue' + } + } + stage('Resolve Flux branch') { + steps { + script { + env.FLUX_BRANCH = sh( + returnStdout: true, + script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml" + ).trim() + if (!env.FLUX_BRANCH) { + error('Flux branch not found in gotk-sync.yaml') + } + echo "Flux branch: ${env.FLUX_BRANCH}" + } + } + } + stage('Promote') { + when { + expression { + def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '') + return env.FLUX_BRANCH && branch == env.FLUX_BRANCH + } + } + steps { + withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) { + sh ''' + set +x + git config user.email "jenkins@bstein.dev" + git config user.name "jenkins" + git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git + git push origin HEAD:${FLUX_BRANCH} + ''' + } + } + } + } +} diff --git a/services/keycloak/ldap-federation-job.yaml b/services/keycloak/ldap-federation-job.yaml index 303fd9f5..3c3f1c19 100644 --- a/services/keycloak/ldap-federation-job.yaml +++ b/services/keycloak/ldap-federation-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: keycloak-ldap-federation-11 + name: keycloak-ldap-federation-12 namespace: sso spec: backoffLimit: 2 @@ -325,6 +325,54 @@ spec: if status not in (201, 204): raise SystemExit(f"Unexpected group mapper create status: {status}") + def ensure_user_attr_mapper(name: str, ldap_attr: str, user_attr: str): + mapper = None + for c in components: + if c.get("name") == name and c.get("parentId") == ldap_component_id: + mapper = c + break + + payload = { + "name": name, + "providerId": "user-attribute-ldap-mapper", + "providerType": "org.keycloak.storage.ldap.mappers.LDAPStorageMapper", + "parentId": ldap_component_id, + "config": { + "ldap.attribute": [ldap_attr], + "user.model.attribute": [user_attr], + "read.only": ["false"], + "always.read.value.from.ldap": ["false"], + "is.mandatory.in.ldap": ["false"], + }, + } + + if mapper: + payload["id"] = mapper["id"] + payload["parentId"] = mapper.get("parentId", payload["parentId"]) + print(f"Updating LDAP user mapper: {payload['id']} ({name})") + status, _, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/components/{payload['id']}", + token, + payload, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected user mapper update status for {name}: {status}") + else: + print(f"Creating LDAP user mapper: {name}") + status, _, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/components", + token, + payload, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected user mapper create status for {name}: {status}") + + ensure_user_attr_mapper("openldap-email", "mail", "email") + ensure_user_attr_mapper("openldap-first-name", "givenName", "firstName") + ensure_user_attr_mapper("openldap-last-name", "sn", "lastName") + # Cleanup duplicate LDAP federation providers and their child components (mappers, etc). # Keep only the canonical provider we updated/created above. try: From cd81dffd857cb62bc90d74a3f94759eb43a36176 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 18:13:49 -0300 Subject: [PATCH 059/416] jenkins: fix dark theme injection --- services/jenkins/scripts/theme.groovy | 140 +++++++++++++++----------- 1 file changed, 83 insertions(+), 57 deletions(-) diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index b20169cb..58755c04 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -1,21 +1,46 @@ import jenkins.model.Jenkins import org.codefirst.SimpleThemeDecorator +import org.jenkinsci.plugins.simpletheme.CssTextThemeElement def instance = Jenkins.get() def decorators = instance.getExtensionList(SimpleThemeDecorator.class) if (decorators?.size() > 0) { def theme = decorators[0] - theme.setCssUrl("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css") - theme.setCssRules(""" -:root { - --atlas-bg: #0f1216; - --atlas-surface: #171b21; - --atlas-surface-alt: #1f252d; - --atlas-border: #2b313b; - --atlas-text: #e6e9ef; - --atlas-text-muted: #b3bac6; - --atlas-link: #8fb7ff; + def cssRules = """ +:root, +.app-theme-picker__picker[data-theme=none] { + --background: #0f1216 !important; + --header-background: #141922 !important; + --header-border: #2b313b !important; + --white: #141922 !important; + --black: #e6e9ef !important; + --very-light-grey: #171b21 !important; + --light-grey: #202734 !important; + --medium-grey: #2b313b !important; + --dark-grey: #0b0f14 !important; + --text-color: #e6e9ef !important; + --text-color-secondary: #a6adba !important; + --card-background: #171b21 !important; + --card-border-color: #2b313b !important; + --pane-header-bg: #1f252d !important; + --pane-header-border-color: #2b313b !important; + --pane-border-color: #2b313b !important; + --pane-text-color: #e6e9ef !important; + --pane-header-text-color: #e6e9ef !important; + --link-color: #8fb7ff !important; + --link-color--hover: #b0ccff !important; + --link-dark-color: #e6e9ef !important; + --link-dark-color--hover: #b0ccff !important; + --input-color: #151a20 !important; + --input-border: #2b313b !important; + --input-border-hover: #3a424d !important; + --button-background: #232a33 !important; + --button-background--hover: #2b313b !important; + --button-background--active: #323b46 !important; + --item-background--hover: #232a33 !important; + --item-background--active: #2b313b !important; + --accent-color: #8fb7ff !important; } body, @@ -29,83 +54,84 @@ body, .bottom-sticker-inner, #breadcrumbBar, #breadcrumbs { - background-color: var(--atlas-bg) !important; - color: var(--atlas-text) !important; + background-color: var(--background) !important; + color: var(--text-color) !important; } -#side-panel .task-link, -#breadcrumbs a, -#breadcrumbs, -#projectstatus th a, +.jenkins-card, +.jenkins-section, +.jenkins-section__item, +#main-panel .jenkins-card, +#main-panel .jenkins-section { + background-color: var(--card-background) !important; + color: var(--text-color) !important; + border-color: var(--card-border-color) !important; +} + +table.pane, +table.pane td, +table.pane th, #projectstatus td, #projectstatus th { - color: var(--atlas-text-muted) !important; -} - -a, -a:visited, -a:link { - color: var(--atlas-link) !important; -} - -a:hover { - opacity: 0.85; -} - -#main-panel, -#main-panel-content, -#description, -.pane, -table.pane { - background-color: var(--atlas-surface) !important; - color: var(--atlas-text) !important; -} - -table.pane tr:nth-child(odd) td { - background-color: var(--atlas-surface) !important; + background-color: var(--card-background) !important; + color: var(--text-color) !important; } table.pane tr:nth-child(even) td, #projectstatus tr:hover td { - background-color: var(--atlas-surface-alt) !important; + background-color: #1f252d !important; } input, select, textarea, #search-box { - background-color: var(--atlas-surface-alt) !important; - color: var(--atlas-text) !important; - border-color: var(--atlas-border) !important; + background-color: #151a20 !important; + color: var(--text-color) !important; + border-color: var(--input-border) !important; } -#header, -#page-header { - background-color: #202734 !important; +a, +a:visited, +a:link { + color: var(--link-color) !important; } -#header .login, -#page-header .login { - color: var(--atlas-text) !important; +a:hover { + opacity: 0.85; } #side-panel .task-link, -#side-panel .task-link:visited, -#side-panel .task-link:hover { - color: var(--atlas-text) !important; +#breadcrumbs a, +#breadcrumbs, +#projectstatus th a { + color: var(--text-color-secondary) !important; +} + +.console-output, +.console-output pre, +pre, +code, +.CodeMirror { + background-color: #0c0f14 !important; + color: #d9dee7 !important; } #footer { - background-color: var(--atlas-bg) !important; - color: var(--atlas-text-muted) !important; + background-color: var(--background) !important; + color: var(--text-color-secondary) !important; } .jenkins_ver:after { content: "atlas dark"; } -""".stripIndent().trim()) +""".stripIndent().trim() + + theme.setElements([new CssTextThemeElement(cssRules)]) + theme.setCssUrl("") + theme.setCssRules(cssRules) theme.setJsUrl("") - instance.save() + theme.save() println("Applied simple-theme-plugin dark theme") } else { println("simple-theme-plugin not installed; skipping theme configuration") From 587a0af1d7e30b6b5d094ce55fa6417cfabb47ea Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 20 Jan 2026 23:03:39 -0300 Subject: [PATCH 060/416] maintenance: wire ariadne db and dashboards --- scripts/dashboards_render_atlas.py | 25 +++ services/maintenance/ariadne-deployment.yaml | 165 ++++++++++++++---- services/maintenance/ariadne-rbac.yaml | 14 +- .../monitoring/dashboards/atlas-testing.json | 113 ++++++++++++ .../monitoring/grafana-dashboard-testing.yaml | 113 ++++++++++++ .../vault/scripts/vault_k8s_auth_configure.sh | 2 +- 6 files changed, 399 insertions(+), 33 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 116bf218..a3fb3727 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -340,6 +340,8 @@ ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{statu ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" +ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' +ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -2267,6 +2269,29 @@ def build_testing_dashboard(): instant=True, ) ) + panels.append( + stat_panel( + 10, + "Ariadne CI Coverage (%)", + ARIADNE_CI_COVERAGE, + {"h": 4, "w": 6, "x": 0, "y": 22}, + unit="percent", + decimals=1, + instant=True, + legend="{{branch}}", + ) + ) + panels.append( + table_panel( + 11, + "Ariadne CI Tests (latest)", + ARIADNE_CI_TESTS, + {"h": 6, "w": 18, "x": 6, "y": 22}, + unit="none", + transformations=sort_desc, + instant=True, + ) + ) return { "uid": "atlas-testing", diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index cd0d38c7..57ce72b7 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -20,14 +20,30 @@ spec: prometheus.io/path: "/metrics" vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "maintenance" - vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/portal/atlas-portal-db" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | - {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} - export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.database_url }}" {{ end }} {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" {{ end }} + {{ with secret "kv/data/atlas/nextcloud/nextcloud-db" }} + export NEXTCLOUD_DB_NAME="{{ .Data.data.database }}" + export NEXTCLOUD_DB_USER="{{ index .Data.data "db-username" }}" + export NEXTCLOUD_DB_PASSWORD="{{ index .Data.data "db-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/nextcloud/nextcloud-admin" }} + export NEXTCLOUD_ADMIN_USER="{{ index .Data.data "admin-user" }}" + export NEXTCLOUD_ADMIN_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/health/wger-admin" }} + export WGER_ADMIN_USERNAME="{{ .Data.data.username }}" + export WGER_ADMIN_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/finance/firefly-secrets" }} + export FIREFLY_CRON_TOKEN="{{ .Data.data.STATIC_CRON_TOKEN }}" + {{ end }} {{ with secret "kv/data/atlas/mailu/mailu-db-secret" }} export MAILU_DB_NAME="{{ .Data.data.database }}" export MAILU_DB_USER="{{ .Data.data.username }}" @@ -42,6 +58,35 @@ spec: export SMTP_PASSWORD="{{ .Data.data.password }}" export SMTP_FROM="no-reply-portal@bstein.dev" {{ end }} + {{ with secret "kv/data/atlas/comms/mas-admin-client-runtime" }} + export COMMS_MAS_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" }} + export COMMS_BOT_PASSWORD="{{ index .Data.data "bot-password" }}" + export COMMS_SEEDER_PASSWORD="{{ index .Data.data "seeder-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/synapse-db" }} + export COMMS_SYNAPSE_DB_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}" + {{ end }} + {{ with secret "kv/data/atlas/vault/vault-oidc-config" }} + export VAULT_OIDC_DISCOVERY_URL="{{ .Data.data.discovery_url }}" + export VAULT_OIDC_CLIENT_ID="{{ .Data.data.client_id }}" + export VAULT_OIDC_CLIENT_SECRET="{{ .Data.data.client_secret }}" + export VAULT_OIDC_DEFAULT_ROLE="{{ .Data.data.default_role }}" + export VAULT_OIDC_SCOPES="{{ .Data.data.scopes }}" + export VAULT_OIDC_USER_CLAIM="{{ .Data.data.user_claim }}" + export VAULT_OIDC_GROUPS_CLAIM="{{ .Data.data.groups_claim }}" + export VAULT_OIDC_TOKEN_POLICIES="{{ .Data.data.token_policies }}" + export VAULT_OIDC_ADMIN_GROUP="{{ .Data.data.admin_group }}" + export VAULT_OIDC_ADMIN_POLICIES="{{ .Data.data.admin_policies }}" + export VAULT_OIDC_DEV_GROUP="{{ .Data.data.dev_group }}" + export VAULT_OIDC_DEV_POLICIES="{{ .Data.data.dev_policies }}" + export VAULT_OIDC_USER_GROUP="{{ .Data.data.user_group }}" + export VAULT_OIDC_USER_POLICIES="{{ .Data.data.user_policies }}" + export VAULT_OIDC_REDIRECT_URIS="{{ .Data.data.redirect_uris }}" + export VAULT_OIDC_BOUND_AUDIENCES="{{ .Data.data.bound_audiences }}" + export VAULT_OIDC_BOUND_CLAIMS_TYPE="{{ .Data.data.bound_claims_type }}" + {{ end }} spec: serviceAccountName: ariadne nodeSelector: @@ -92,6 +137,8 @@ spec: value: dev - name: MAILU_DOMAIN value: bstein.dev + - name: MAILU_HOST + value: mail.bstein.dev - name: MAILU_SYNC_URL value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC @@ -102,46 +149,84 @@ spec: value: "5432" - name: NEXTCLOUD_NAMESPACE value: nextcloud - - name: NEXTCLOUD_MAIL_SYNC_CRONJOB - value: nextcloud-mail-sync - - name: NEXTCLOUD_MAIL_SYNC_WAIT_TIMEOUT_SEC - value: "90" - - name: NEXTCLOUD_MAIL_SYNC_JOB_TTL_SEC - value: "3600" + - name: NEXTCLOUD_POD_LABEL + value: app=nextcloud + - name: NEXTCLOUD_CONTAINER + value: nextcloud + - name: NEXTCLOUD_EXEC_TIMEOUT_SEC + value: "120" + - name: NEXTCLOUD_URL + value: https://cloud.bstein.dev + - name: NEXTCLOUD_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: NEXTCLOUD_DB_PORT + value: "5432" - name: WGER_NAMESPACE value: health - - name: WGER_USER_SYNC_CRONJOB - value: wger-user-sync - - name: WGER_ADMIN_CRONJOB - value: wger-admin-ensure - name: WGER_USER_SYNC_WAIT_TIMEOUT_SEC value: "90" + - name: WGER_POD_LABEL + value: app=wger + - name: WGER_CONTAINER + value: wger + - name: WGER_ADMIN_EMAIL + value: brad@bstein.dev - name: FIREFLY_NAMESPACE value: finance - - name: FIREFLY_USER_SYNC_CRONJOB - value: firefly-user-sync - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC value: "90" + - name: FIREFLY_POD_LABEL + value: app=firefly + - name: FIREFLY_CONTAINER + value: firefly + - name: FIREFLY_CRON_BASE_URL + value: http://firefly.finance.svc.cluster.local/api/v1/cron + - name: FIREFLY_CRON_TIMEOUT_SEC + value: "30" - name: VAULT_NAMESPACE value: vault - - name: VAULT_K8S_AUTH_CRONJOB - value: vault-k8s-auth-config - - name: VAULT_OIDC_CRONJOB - value: vault-oidc-config - - name: VAULT_JOB_WAIT_TIMEOUT_SEC - value: "120" + - name: VAULT_ADDR + value: http://vault.vault.svc.cluster.local:8200 + - name: VAULT_K8S_ROLE + value: vault-admin + - name: VAULT_K8S_ROLE_TTL + value: 1h - name: COMMS_NAMESPACE value: comms - - name: COMMS_GUEST_NAME_CRONJOB - value: guest-name-randomizer - - name: COMMS_PIN_INVITE_CRONJOB - value: pin-othrys-invite - - name: COMMS_RESET_ROOM_CRONJOB - value: othrys-room-reset - - name: COMMS_SEED_ROOM_CRONJOB - value: seed-othrys-room - - name: COMMS_JOB_WAIT_TIMEOUT_SEC - value: "60" + - name: COMMS_SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse:8008 + - name: COMMS_AUTH_BASE + value: http://matrix-authentication-service:8080 + - name: COMMS_MAS_ADMIN_API_BASE + value: http://matrix-authentication-service:8081/api/admin/v1 + - name: COMMS_MAS_TOKEN_URL + value: http://matrix-authentication-service:8080/oauth2/token + - name: COMMS_MAS_ADMIN_CLIENT_ID + value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM + - name: COMMS_SERVER_NAME + value: live.bstein.dev + - name: COMMS_ROOM_ALIAS + value: "#othrys:live.bstein.dev" + - name: COMMS_ROOM_NAME + value: Othrys + - name: COMMS_PIN_MESSAGE + value: "Invite guests: share https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join and choose 'Continue' -> 'Join as guest'." + - name: COMMS_SEEDER_USER + value: othrys-seeder + - name: COMMS_BOT_USER + value: atlasbot + - name: COMMS_SYNAPSE_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: COMMS_SYNAPSE_DB_PORT + value: "5432" + - name: COMMS_SYNAPSE_DB_NAME + value: synapse + - name: COMMS_SYNAPSE_DB_USER + value: synapse + - name: COMMS_TIMEOUT_SEC + value: "30" + - name: COMMS_GUEST_STALE_DAYS + value: "14" - name: VAULTWARDEN_NAMESPACE value: vaultwarden - name: VAULTWARDEN_POD_LABEL @@ -172,10 +257,22 @@ spec: value: "30 4 * * *" - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC value: "0 5 * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON + value: "*/5 * * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE + value: "30 4 * * *" - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN value: "15 3 * * *" + - name: ARIADNE_SCHEDULE_FIREFLY_CRON + value: "0 3 * * *" + - name: ARIADNE_SCHEDULE_POD_CLEANER + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE + value: "23 3 * * *" + - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER + value: "30 4 * * 0" - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_VAULT_OIDC @@ -192,6 +289,12 @@ spec: value: "true" - name: K8S_API_TIMEOUT_SEC value: "5" + - name: OPENSEARCH_URL + value: http://opensearch-master.logging.svc.cluster.local:9200 + - name: OPENSEARCH_LIMIT_BYTES + value: "1099511627776" + - name: OPENSEARCH_INDEX_PATTERNS + value: kube-*,journald-*,trace-analytics-* - name: METRICS_PATH value: "/metrics" resources: diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml index 8d2a2a9a..e2f08c9b 100644 --- a/services/maintenance/ariadne-rbac.yaml +++ b/services/maintenance/ariadne-rbac.yaml @@ -6,13 +6,25 @@ metadata: rules: - apiGroups: ["batch"] resources: - - cronjobs - jobs verbs: - get - list - watch - create + - apiGroups: [""] + resources: + - pods + verbs: + - get + - list + - watch + - delete + - apiGroups: [""] + resources: + - pods/exec + verbs: + - create --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index c9c0c9ab..b76f9095 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -471,6 +471,119 @@ } } ] + }, + { + "id": 10, + "type": "stat", + "title": "Ariadne CI Coverage (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 22 + }, + "targets": [ + { + "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", + "refId": "A", + "legendFormat": "{{branch}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 11, + "type": "table", + "title": "Ariadne CI Tests (latest)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 18, + "x": 6, + "y": 22 + }, + "targets": [ + { + "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] } ], "time": { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 7746f165..09c29a4c 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -480,6 +480,119 @@ data: } } ] + }, + { + "id": 10, + "type": "stat", + "title": "Ariadne CI Coverage (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 22 + }, + "targets": [ + { + "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", + "refId": "A", + "legendFormat": "{{branch}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 11, + "type": "table", + "title": "Ariadne CI Tests (latest)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 18, + "x": 6, + "y": 22 + }, + "targets": [ + { + "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] } ], "time": { diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index a5ccb61d..c14c5ec6 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ - "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" + "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ From c804ec040c5fcf91328e0686854aef9eba0d3e50 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 02:57:40 -0300 Subject: [PATCH 061/416] glue: centralize sync tasks in ariadne --- .../cert-manager/letsencrypt-prod.yaml | 2 +- .../sources/cert-manager/letsencrypt.yaml | 2 +- scripts/dashboards_render_atlas.py | 35 +++++- services/finance/firefly-cronjob.yaml | 1 + services/keycloak/deployment.yaml | 2 +- services/keycloak/realm-settings-job.yaml | 73 +++++++++++++ .../logging/opensearch-prune-cronjob.yaml | 1 + services/mailu/kustomization.yaml | 5 - services/maintenance/ariadne-deployment.yaml | 12 +- .../maintenance/image-sweeper-cronjob.yaml | 1 + services/maintenance/pod-cleaner-cronjob.yaml | 1 + .../monitoring/dashboards/atlas-testing.json | 103 ++++++++++++++++-- .../monitoring/grafana-dashboard-testing.yaml | 103 ++++++++++++++++-- services/nextcloud/cronjob.yaml | 1 + services/nextcloud/maintenance-cronjob.yaml | 1 + 15 files changed, 313 insertions(+), 30 deletions(-) diff --git a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml index 7f90f01a..5795b091 100644 --- a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml +++ b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml @@ -5,7 +5,7 @@ metadata: name: letsencrypt-prod spec: acme: - email: brad.stein@gmail.com + email: brad@bstein.dev server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-prod-account-key diff --git a/infrastructure/sources/cert-manager/letsencrypt.yaml b/infrastructure/sources/cert-manager/letsencrypt.yaml index a988312c..5fbe4e36 100644 --- a/infrastructure/sources/cert-manager/letsencrypt.yaml +++ b/infrastructure/sources/cert-manager/letsencrypt.yaml @@ -5,7 +5,7 @@ metadata: name: letsencrypt spec: acme: - email: brad.stein@gmail.com + email: brad@bstein.dev server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-account-key diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index a3fb3727..509cf493 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -338,7 +338,9 @@ GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' +ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" +ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' @@ -2236,12 +2238,24 @@ def build_testing_dashboard(): instant=True, ) ) + panels.append( + timeseries_panel( + 12, + "Ariadne Task Runs vs Errors (1h)", + ARIADNE_TASK_RUNS_BY_STATUS_1H, + {"h": 6, "w": 24, "x": 0, "y": 12}, + unit="none", + legend="{{status}}", + legend_display="table", + legend_placement="right", + ) + ) panels.append( table_panel( 7, "Ariadne Task Errors (24h)", ARIADNE_TASK_ERRORS_24H, - {"h": 6, "w": 12, "x": 0, "y": 12}, + {"h": 6, "w": 12, "x": 0, "y": 18}, unit="none", transformations=sort_desc, instant=True, @@ -2252,7 +2266,7 @@ def build_testing_dashboard(): 8, "Ariadne Schedule Last Success (hours ago)", ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 6, "w": 12, "x": 12, "y": 12}, + {"h": 6, "w": 12, "x": 12, "y": 18}, unit="h", transformations=sort_desc, instant=True, @@ -2263,18 +2277,29 @@ def build_testing_dashboard(): 9, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS, - {"h": 4, "w": 24, "x": 0, "y": 18}, + {"h": 6, "w": 12, "x": 12, "y": 24}, unit="none", transformations=sort_desc, instant=True, ) ) + panels.append( + table_panel( + 13, + "Ariadne Schedule Last Error (hours ago)", + ARIADNE_SCHEDULE_LAST_ERROR_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 24}, + unit="h", + transformations=sort_desc, + instant=True, + ) + ) panels.append( stat_panel( 10, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 4, "w": 6, "x": 0, "y": 22}, + {"h": 4, "w": 6, "x": 0, "y": 30}, unit="percent", decimals=1, instant=True, @@ -2286,7 +2311,7 @@ def build_testing_dashboard(): 11, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 18, "x": 6, "y": 22}, + {"h": 6, "w": 18, "x": 6, "y": 30}, unit="none", transformations=sort_desc, instant=True, diff --git a/services/finance/firefly-cronjob.yaml b/services/finance/firefly-cronjob.yaml index 6c4d5072..9e5c8522 100644 --- a/services/finance/firefly-cronjob.yaml +++ b/services/finance/firefly-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: finance spec: schedule: "0 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/keycloak/deployment.yaml b/services/keycloak/deployment.yaml index 3d241c98..131169db 100644 --- a/services/keycloak/deployment.yaml +++ b/services/keycloak/deployment.yaml @@ -126,7 +126,7 @@ spec: - name: KC_EVENTS_LISTENERS value: jboss-logging,mailu-http - name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT - value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events + value: http://ariadne.maintenance.svc.cluster.local/events ports: - containerPort: 8080 name: http diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index fdee377c..786948be 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -469,6 +469,79 @@ spec: if status not in (201, 204): raise SystemExit(f"Unexpected protocol mapper create response: {status}") + # Ensure mailu_email overrides email claim for service clients. + excluded_email_clients = { + "account", + "account-console", + "admin-cli", + "security-admin-console", + "realm-management", + "broker", + } + status, clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients", + access_token, + ) + if status == 200 and isinstance(clients, list): + for client in clients: + if not isinstance(client, dict): + continue + if client.get("protocol") != "openid-connect": + continue + client_name = client.get("clientId") if isinstance(client.get("clientId"), str) else "" + if not client_name or client_name in excluded_email_clients: + continue + client_id = client.get("id") + if not client_id: + continue + email_mapper = { + "name": "mailu-email", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": False, + "config": { + "user.attribute": "mailu_email", + "claim.name": "email", + "jsonType.label": "String", + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "multivalued": "false", + "aggregate.attrs": "false", + }, + } + status, mappers = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + ) + existing = None + if status == 200 and isinstance(mappers, list): + for item in mappers: + if isinstance(item, dict) and item.get("name") == email_mapper["name"]: + existing = item + break + if existing and existing.get("id"): + email_mapper["id"] = existing["id"] + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing['id']}", + access_token, + email_mapper, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected mailu email mapper update response: {status}") + else: + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + email_mapper, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected mailu email mapper create response: {status}") + # Ensure MFA is on by default for newly-created users. status, required_actions = http_json( "GET", diff --git a/services/logging/opensearch-prune-cronjob.yaml b/services/logging/opensearch-prune-cronjob.yaml index 75e72dbd..dc0dffb2 100644 --- a/services/logging/opensearch-prune-cronjob.yaml +++ b/services/logging/opensearch-prune-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: logging spec: schedule: "23 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/mailu/kustomization.yaml b/services/mailu/kustomization.yaml index 5c111eb6..7447f24a 100644 --- a/services/mailu/kustomization.yaml +++ b/services/mailu/kustomization.yaml @@ -15,7 +15,6 @@ resources: - ingressroute.yaml - mailu-sync-job.yaml - mailu-sync-cronjob.yaml - - mailu-sync-listener.yaml - front-lb.yaml configMapGenerator: @@ -31,10 +30,6 @@ configMapGenerator: - sync.py=scripts/mailu_sync.py options: disableNameSuffixHash: true - - name: mailu-sync-listener - namespace: mailu-mailserver - files: - - listener.py=scripts/mailu_sync_listener.py - name: mailu-vault-entrypoint namespace: mailu-mailserver files: diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 57ce72b7..57862abb 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -23,6 +23,7 @@ spec: vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" export PORTAL_DATABASE_URL="{{ .Data.data.database_url }}" {{ end }} {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} @@ -57,6 +58,7 @@ spec: export SMTP_USERNAME="no-reply-portal@bstein.dev" export SMTP_PASSWORD="{{ .Data.data.password }}" export SMTP_FROM="no-reply-portal@bstein.dev" + export MAILU_SYSTEM_PASSWORD="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/comms/mas-admin-client-runtime" }} export COMMS_MAS_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" @@ -140,7 +142,11 @@ spec: - name: MAILU_HOST value: mail.bstein.dev - name: MAILU_SYNC_URL - value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events + value: http://ariadne.maintenance.svc.cluster.local/events + - name: MAILU_EVENT_MIN_INTERVAL_SEC + value: "10" + - name: MAILU_SYSTEM_USERS + value: no-reply-portal@bstein.dev,no-reply-vaultwarden@bstein.dev - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC value: "180" - name: MAILU_DB_HOST @@ -263,8 +269,12 @@ spec: value: "30 4 * * *" - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC value: "*/15 * * * *" + - name: ARIADNE_SCHEDULE_WGER_USER_SYNC + value: "0 5 * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN value: "15 3 * * *" + - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC + value: "0 6 * * *" - name: ARIADNE_SCHEDULE_FIREFLY_CRON value: "0 3 * * *" - name: ARIADNE_SCHEDULE_POD_CLEANER diff --git a/services/maintenance/image-sweeper-cronjob.yaml b/services/maintenance/image-sweeper-cronjob.yaml index c94fcca6..00392060 100644 --- a/services/maintenance/image-sweeper-cronjob.yaml +++ b/services/maintenance/image-sweeper-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: maintenance spec: schedule: "30 4 * * 0" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 2 failedJobsHistoryLimit: 2 diff --git a/services/maintenance/pod-cleaner-cronjob.yaml b/services/maintenance/pod-cleaner-cronjob.yaml index e083c85f..99d13f67 100644 --- a/services/maintenance/pod-cleaner-cronjob.yaml +++ b/services/maintenance/pod-cleaner-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: maintenance spec: schedule: "0 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index b76f9095..207077ef 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -322,6 +322,43 @@ } ] }, + { + "id": 12, + "type": "timeseries", + "title": "Ariadne Task Runs vs Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, { "id": 7, "type": "table", @@ -334,7 +371,7 @@ "h": 6, "w": 12, "x": 0, - "y": 12 + "y": 18 }, "targets": [ { @@ -384,7 +421,7 @@ "h": 6, "w": 12, "x": 12, - "y": 12 + "y": 18 }, "targets": [ { @@ -431,10 +468,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 4, - "w": 24, - "x": 0, - "y": 18 + "h": 6, + "w": 12, + "x": 12, + "y": 24 }, "targets": [ { @@ -472,6 +509,56 @@ } ] }, + { + "id": 13, + "type": "table", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 24 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 10, "type": "stat", @@ -484,7 +571,7 @@ "h": 4, "w": 6, "x": 0, - "y": 22 + "y": 30 }, "targets": [ { @@ -547,7 +634,7 @@ "h": 6, "w": 18, "x": 6, - "y": 22 + "y": 30 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 09c29a4c..362751bb 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -331,6 +331,43 @@ data: } ] }, + { + "id": 12, + "type": "timeseries", + "title": "Ariadne Task Runs vs Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, { "id": 7, "type": "table", @@ -343,7 +380,7 @@ data: "h": 6, "w": 12, "x": 0, - "y": 12 + "y": 18 }, "targets": [ { @@ -393,7 +430,7 @@ data: "h": 6, "w": 12, "x": 12, - "y": 12 + "y": 18 }, "targets": [ { @@ -440,10 +477,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 4, - "w": 24, - "x": 0, - "y": 18 + "h": 6, + "w": 12, + "x": 12, + "y": 24 }, "targets": [ { @@ -481,6 +518,56 @@ data: } ] }, + { + "id": 13, + "type": "table", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 24 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 10, "type": "stat", @@ -493,7 +580,7 @@ data: "h": 4, "w": 6, "x": 0, - "y": 22 + "y": 30 }, "targets": [ { @@ -556,7 +643,7 @@ data: "h": 6, "w": 18, "x": 6, - "y": 22 + "y": 30 }, "targets": [ { diff --git a/services/nextcloud/cronjob.yaml b/services/nextcloud/cronjob.yaml index cc0091bc..58d8aa1b 100644 --- a/services/nextcloud/cronjob.yaml +++ b/services/nextcloud/cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: nextcloud spec: schedule: "*/5 * * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/services/nextcloud/maintenance-cronjob.yaml b/services/nextcloud/maintenance-cronjob.yaml index d4008c7c..177cc022 100644 --- a/services/nextcloud/maintenance-cronjob.yaml +++ b/services/nextcloud/maintenance-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: nextcloud spec: schedule: "30 4 * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: From 18a086ce9528ae664f967ac65718cfceecacf986 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:03:32 -0300 Subject: [PATCH 062/416] keycloak: bump realm settings job name --- services/keycloak/realm-settings-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index 786948be..6e6589de 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-33 + name: keycloak-realm-settings-34 namespace: sso spec: backoffLimit: 0 From ec36cd21e3a36f01880161570d11903496c5e694 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:05:53 -0300 Subject: [PATCH 063/416] rbac: allow ariadne to read cronjobs --- services/maintenance/ariadne-rbac.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml index e2f08c9b..8a063bf3 100644 --- a/services/maintenance/ariadne-rbac.yaml +++ b/services/maintenance/ariadne-rbac.yaml @@ -7,6 +7,7 @@ rules: - apiGroups: ["batch"] resources: - jobs + - cronjobs verbs: - get - list From 0680926dae1c938121c2f401e9b748dddeadad90 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:21:01 -0300 Subject: [PATCH 064/416] vault: allow ariadne to read needed secrets --- services/vault/scripts/vault_k8s_auth_configure.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index c14c5ec6..2fce3f4e 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ - "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" "" + "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ From 0ab34c0af5f4c0bd66686e738fb8166e7faafbe4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:39:17 -0300 Subject: [PATCH 065/416] ariadne: split portal and ariadne db secrets --- services/maintenance/ariadne-deployment.yaml | 4 +++- services/vault/scripts/vault_k8s_auth_configure.sh | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 57862abb..bb9766f9 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -24,7 +24,9 @@ spec: vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" - export PORTAL_DATABASE_URL="{{ .Data.data.database_url }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" {{ end }} {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 2fce3f4e..bc03cf4c 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ - "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" "" + "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ From f7f549e536d46736bb200a9f91132213459deeec Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 03:53:34 -0300 Subject: [PATCH 066/416] maintenance: bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index a86453e1..fd544410 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-6 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-10 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 3d807570568c6f1314bb171821ad701098798ddd Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 04:05:41 -0300 Subject: [PATCH 067/416] maintenance: fix ariadne comms endpoints and exec RBAC --- services/maintenance/ariadne-deployment.yaml | 8 ++++---- services/maintenance/ariadne-rbac.yaml | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index bb9766f9..069f3885 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -202,13 +202,13 @@ spec: - name: COMMS_NAMESPACE value: comms - name: COMMS_SYNAPSE_BASE - value: http://othrys-synapse-matrix-synapse:8008 + value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008 - name: COMMS_AUTH_BASE - value: http://matrix-authentication-service:8080 + value: http://matrix-authentication-service.comms.svc.cluster.local:8080 - name: COMMS_MAS_ADMIN_API_BASE - value: http://matrix-authentication-service:8081/api/admin/v1 + value: http://matrix-authentication-service.comms.svc.cluster.local:8081/api/admin/v1 - name: COMMS_MAS_TOKEN_URL - value: http://matrix-authentication-service:8080/oauth2/token + value: http://matrix-authentication-service.comms.svc.cluster.local:8080/oauth2/token - name: COMMS_MAS_ADMIN_CLIENT_ID value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM - name: COMMS_SERVER_NAME diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml index 8a063bf3..88689cb6 100644 --- a/services/maintenance/ariadne-rbac.yaml +++ b/services/maintenance/ariadne-rbac.yaml @@ -25,6 +25,7 @@ rules: resources: - pods/exec verbs: + - get - create --- From 152c660b0a73e5c4465aaf8842d2476025af6b13 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 05:03:26 -0300 Subject: [PATCH 068/416] maintenance: bump ariadne image tag --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index fd544410..a53ffee5 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-10 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne"} configMapGenerator: - name: disable-k3s-traefik-script From 94ad57e5a56de031f586e24af0f494e5bc9f5660 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 10:20:53 -0300 Subject: [PATCH 069/416] flux: align imagepolicy tag setters --- services/bstein-dev-home/kustomization.yaml | 4 ++-- services/maintenance/kustomization.yaml | 2 +- services/pegasus/deployment.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ec137dc6..26840ab7 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,9 +20,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} + newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} + newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index a53ffee5..daee5f14 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne"} + newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script diff --git a/services/pegasus/deployment.yaml b/services/pegasus/deployment.yaml index bc3db70a..b6a1639e 100644 --- a/services/pegasus/deployment.yaml +++ b/services/pegasus/deployment.yaml @@ -72,7 +72,7 @@ spec: containers: - name: pegasus - image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus"} + image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus:tag"} imagePullPolicy: Always env: - name: PEGASUS_MEDIA_ROOT From 6a0872259b9f063e994829ea93a41af9f5beae0d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 10:33:06 -0300 Subject: [PATCH 070/416] flux: align image automation namespaces --- .../applications/bstein-dev-home/image-automation.yaml | 2 +- .../flux-system/applications/pegasus/image-automation.yaml | 2 +- .../flux-system/platform/maintenance/image-automation.yaml | 2 +- services/pegasus/kustomization.yaml | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 643d4792..10d79132 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: bstein-dev-home - namespace: flux-system + namespace: bstein-dev-home spec: interval: 1m0s sourceRef: diff --git a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml index ec0494e5..d11422a8 100644 --- a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: pegasus - namespace: flux-system + namespace: jellyfin spec: interval: 1m0s sourceRef: diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml index 867cae48..9f3214b5 100644 --- a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: maintenance - namespace: flux-system + namespace: maintenance spec: interval: 1m0s sourceRef: diff --git a/services/pegasus/kustomization.yaml b/services/pegasus/kustomization.yaml index bef2b405..05c3baa5 100644 --- a/services/pegasus/kustomization.yaml +++ b/services/pegasus/kustomization.yaml @@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - configmap.yaml + - image.yaml - vault-serviceaccount.yaml - secretproviderclass.yaml - service.yaml From d033d680a31f2b8215a508e6f5384365a8a175be Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 10:34:25 -0300 Subject: [PATCH 071/416] flux: fix image automation templates --- .../applications/bstein-dev-home/image-automation.yaml | 2 +- .../flux-system/platform/maintenance/image-automation.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 10d79132..8b2900c1 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -18,7 +18,7 @@ spec: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}" + messageTemplate: "chore(bstein-dev-home): update images to {{range .Changed.Images}}{{.}}{{end}}" push: branch: feature/ariadne update: diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml index 9f3214b5..48e4c309 100644 --- a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -18,7 +18,7 @@ spec: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(maintenance): update images to {{range .Updated.Images}}{{.}}{{end}}" + messageTemplate: "chore(maintenance): update images to {{range .Changed.Images}}{{.}}{{end}}" push: branch: feature/ariadne update: From 1aadaf59ffadd820599c12e9f7fca126151b5f3d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 10:35:29 -0300 Subject: [PATCH 072/416] flux: simplify image automation messages --- .../applications/bstein-dev-home/image-automation.yaml | 2 +- .../flux-system/platform/maintenance/image-automation.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 8b2900c1..f1d41be3 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -18,7 +18,7 @@ spec: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(bstein-dev-home): update images to {{range .Changed.Images}}{{.}}{{end}}" + messageTemplate: "chore(bstein-dev-home): automated image update" push: branch: feature/ariadne update: diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml index 48e4c309..6e8f612c 100644 --- a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -18,7 +18,7 @@ spec: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(maintenance): update images to {{range .Changed.Images}}{{.}}{{end}}" + messageTemplate: "chore(maintenance): automated image update" push: branch: feature/ariadne update: From 0ed261a5df5ee30cf54e248eb3b56df20f3df7a5 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 13:35:55 +0000 Subject: [PATCH 073/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index daee5f14..088ce488 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -23,11 +23,9 @@ resources: - node-image-sweeper-serviceaccount.yaml - node-image-sweeper-daemonset.yaml - image-sweeper-cronjob.yaml - images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne:tag"} - + newTag: 0.1.0-15 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 3ffb1b8a209e3e5d0891866c8eaeeee87c5493ff Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 13:36:39 +0000 Subject: [PATCH 074/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 26840ab7..78f5e685 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,9 +20,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From b05a76eb0795dffffce2a2cf99041340a3699be8 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 14:04:54 +0000 Subject: [PATCH 075/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 088ce488..05f3be2f 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-15 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-16 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From aaeb933625e707675a249814aaa17a2999a8c63b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 11:29:29 -0300 Subject: [PATCH 076/416] monitoring: refresh testing dashboard --- .gitignore | 1 + scripts/dashboards_render_atlas.py | 200 ++-- .../monitoring/dashboards/atlas-testing.json | 896 ++++++++++++------ .../monitoring/grafana-dashboard-testing.yaml | 896 ++++++++++++------ 4 files changed, 1401 insertions(+), 592 deletions(-) diff --git a/.gitignore b/.gitignore index 8d0ab1e9..7543bbfb 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ __pycache__/ *.py[cod] .pytest_cache .venv +.venv-ci tmp/ diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 509cf493..6eaafb46 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -339,6 +339,9 @@ GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" @@ -696,8 +699,10 @@ def bargauge_panel( grid, *, unit="none", + legend=None, links=None, limit=None, + sort_order="desc", thresholds=None, decimals=None, instant=False, @@ -710,7 +715,12 @@ def bargauge_panel( "datasource": PROM_DS, "gridPos": grid, "targets": [ - {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})} + { + "expr": expr, + "refId": "A", + "legendFormat": legend or "{{node}}", + **({"instant": True} if instant else {}), + } ], "fieldConfig": { "defaults": { @@ -748,7 +758,7 @@ def bargauge_panel( panel["transformations"] = [ { "id": "sortBy", - "options": {"fields": ["Value"], "order": "desc"}, + "options": {"fields": ["Value"], "order": sort_order}, } ] if limit: @@ -2163,7 +2173,24 @@ def build_mail_dashboard(): def build_testing_dashboard(): panels = [] - sort_desc = [{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}] + age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 24}, + {"color": "red", "value": 48}, + ], + } + recent_error_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 1}, + {"color": "yellow", "value": 6}, + {"color": "green", "value": 24}, + ], + } panels.append( stat_panel( @@ -2184,66 +2211,56 @@ def build_testing_dashboard(): ) ) panels.append( - table_panel( + stat_panel( 2, "Glue Jobs Missing Success", - GLUE_MISSING_ACTIVE, - {"h": 4, "w": 6, "x": 6, "y": 0}, + GLUE_MISSING_COUNT, + {"h": 4, "w": 4, "x": 4, "y": 0}, unit="none", - transformations=sort_desc, - instant=True, ) ) panels.append( - table_panel( + stat_panel( 3, "Glue Jobs Suspended", - GLUE_SUSPENDED, - {"h": 4, "w": 6, "x": 12, "y": 0}, + GLUE_SUSPENDED_COUNT, + {"h": 4, "w": 4, "x": 8, "y": 0}, unit="none", - transformations=sort_desc, - instant=True, ) ) panels.append( - table_panel( + stat_panel( 4, - "Glue Jobs Active Runs", - GLUE_ACTIVE, - {"h": 4, "w": 6, "x": 18, "y": 0}, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H_TOTAL, + {"h": 4, "w": 4, "x": 12, "y": 0}, unit="none", - transformations=sort_desc, - instant=True, ) ) panels.append( - table_panel( + stat_panel( 5, - "Glue Jobs Last Success (hours ago)", - GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 4}, - unit="h", - transformations=sort_desc, - instant=True, + "Ariadne Task Errors (24h)", + ARIADNE_TASK_ERRORS_24H_TOTAL, + {"h": 4, "w": 4, "x": 16, "y": 0}, + unit="none", ) ) panels.append( - table_panel( + stat_panel( 6, - "Glue Jobs Last Schedule (hours ago)", - GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 4}, - unit="h", - transformations=sort_desc, - instant=True, + "Ariadne Task Runs (1h)", + ARIADNE_TASK_RUNS_1H_TOTAL, + {"h": 4, "w": 4, "x": 20, "y": 0}, + unit="none", ) ) panels.append( timeseries_panel( - 12, + 7, "Ariadne Task Runs vs Errors (1h)", ARIADNE_TASK_RUNS_BY_STATUS_1H, - {"h": 6, "w": 24, "x": 0, "y": 12}, + {"h": 6, "w": 24, "x": 0, "y": 4}, unit="none", legend="{{status}}", legend_display="table", @@ -2251,55 +2268,110 @@ def build_testing_dashboard(): ) ) panels.append( - table_panel( - 7, + bargauge_panel( + 8, "Ariadne Task Errors (24h)", ARIADNE_TASK_ERRORS_24H, - {"h": 6, "w": 12, "x": 0, "y": 18}, + {"h": 8, "w": 12, "x": 0, "y": 10}, unit="none", - transformations=sort_desc, instant=True, + legend="{{task}}", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5}, + ], + }, ) ) panels.append( - table_panel( - 8, - "Ariadne Schedule Last Success (hours ago)", - ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 6, "w": 12, "x": 12, "y": 18}, - unit="h", - transformations=sort_desc, - instant=True, - ) - ) - panels.append( - table_panel( + bargauge_panel( 9, - "Ariadne Access Requests", - ARIADNE_ACCESS_REQUESTS, - {"h": 6, "w": 12, "x": 12, "y": 24}, + "Ariadne Task Success (24h)", + ARIADNE_TASK_SUCCESS_24H, + {"h": 8, "w": 12, "x": 12, "y": 10}, unit="none", - transformations=sort_desc, instant=True, + legend="{{task}}", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 1}, + {"color": "yellow", "value": 5}, + {"color": "green", "value": 10}, + ], + }, ) ) panels.append( - table_panel( - 13, + bargauge_panel( + 10, "Ariadne Schedule Last Error (hours ago)", ARIADNE_SCHEDULE_LAST_ERROR_HOURS, - {"h": 6, "w": 12, "x": 0, "y": 24}, + {"h": 8, "w": 12, "x": 0, "y": 18}, unit="h", - transformations=sort_desc, instant=True, + legend="{{task}}", + thresholds=recent_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 11, + "Ariadne Schedule Last Success (hours ago)", + ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, + {"h": 8, "w": 12, "x": 12, "y": 18}, + unit="h", + instant=True, + legend="{{task}}", + thresholds=age_thresholds, + ) + ) + panels.append( + bargauge_panel( + 12, + "Glue Jobs Last Success (hours ago)", + GLUE_LAST_SUCCESS_AGE_HOURS, + {"h": 8, "w": 12, "x": 0, "y": 26}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + ) + ) + panels.append( + bargauge_panel( + 13, + "Glue Jobs Last Schedule (hours ago)", + GLUE_LAST_SCHEDULE_AGE_HOURS, + {"h": 8, "w": 12, "x": 12, "y": 26}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + ) + ) + panels.append( + bargauge_panel( + 14, + "Ariadne Access Requests", + ARIADNE_ACCESS_REQUESTS, + {"h": 6, "w": 8, "x": 0, "y": 34}, + unit="none", + instant=True, + legend="{{status}}", ) ) panels.append( stat_panel( - 10, + 15, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 4, "w": 6, "x": 0, "y": 30}, + {"h": 6, "w": 4, "x": 8, "y": 34}, unit="percent", decimals=1, instant=True, @@ -2308,12 +2380,12 @@ def build_testing_dashboard(): ) panels.append( table_panel( - 11, + 16, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 18, "x": 6, "y": 30}, + {"h": 6, "w": 12, "x": 12, "y": 34}, unit="none", - transformations=sort_desc, + transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, ) ) diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index 207077ef..420abf26 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -74,7 +74,7 @@ }, { "id": 2, - "type": "table", + "type": "stat", "title": "Glue Jobs Missing Success", "datasource": { "type": "prometheus", @@ -82,49 +82,59 @@ }, "gridPos": { "h": 4, - "w": 6, - "x": 6, + "w": 4, + "x": 4, "y": 0 }, "targets": [ { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A", - "instant": true + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 3, - "type": "table", + "type": "stat", "title": "Glue Jobs Suspended", "datasource": { "type": "prometheus", @@ -132,198 +142,238 @@ }, "gridPos": { "h": 4, - "w": 6, - "x": 12, + "w": 4, + "x": 8, "y": 0 }, "targets": [ { - "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", - "refId": "A", - "instant": true + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 4, - "type": "table", - "title": "Glue Jobs Active Runs", + "type": "stat", + "title": "Ariadne Task Errors (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, - "w": 6, - "x": 18, + "w": 4, + "x": 12, "y": 0 }, "targets": [ { - "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 5, - "type": "table", - "title": "Glue Jobs Last Success (hours ago)", + "type": "stat", + "title": "Ariadne Task Errors (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 + "h": 4, + "w": 4, + "x": 16, + "y": 0 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "h", + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 6, - "type": "table", - "title": "Glue Jobs Last Schedule (hours ago)", + "type": "stat", + "title": "Ariadne Task Runs (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 + "h": 4, + "w": 4, + "x": 20, + "y": 0 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "h", + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { - "id": 12, + "id": 7, "type": "timeseries", "title": "Ariadne Task Runs vs Errors (1h)", "datasource": { @@ -334,7 +384,7 @@ "h": 6, "w": 24, "x": 0, - "y": 12 + "y": 4 }, "targets": [ { @@ -360,94 +410,68 @@ } }, { - "id": 7, - "type": "table", + "id": 8, + "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 10 }, "targets": [ { "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", "refId": "A", + "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "none", - "custom": { - "filterable": true + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 8, - "type": "table", - "title": "Ariadne Schedule Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 18 - }, - "targets": [ - { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, { "id": "sortBy", "options": { @@ -461,93 +485,67 @@ }, { "id": 9, - "type": "table", - "title": "Ariadne Access Requests", + "type": "bargauge", + "title": "Ariadne Task Success (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 10 }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", "refId": "A", + "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "none", - "custom": { - "filterable": true + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "green", + "value": 10 + } + ] } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 13, - "type": "table", - "title": "Ariadne Schedule Last Error (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 24 - }, - "targets": [ - { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, { "id": "sortBy", "options": { @@ -561,6 +559,376 @@ }, { "id": 10, + "type": "bargauge", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "green", + "value": 24 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 11, + "type": "bargauge", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 12, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "targets": [ + { + "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 13, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "targets": [ + { + "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 34 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "legendFormat": "{{status}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -568,10 +936,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 30 + "h": 6, + "w": 4, + "x": 8, + "y": 34 }, "targets": [ { @@ -623,7 +991,7 @@ } }, { - "id": 11, + "id": 16, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -632,9 +1000,9 @@ }, "gridPos": { "h": 6, - "w": 18, - "x": 6, - "y": 30 + "w": 12, + "x": 12, + "y": 34 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 362751bb..52b28367 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -83,7 +83,7 @@ data: }, { "id": 2, - "type": "table", + "type": "stat", "title": "Glue Jobs Missing Success", "datasource": { "type": "prometheus", @@ -91,49 +91,59 @@ data: }, "gridPos": { "h": 4, - "w": 6, - "x": 6, + "w": 4, + "x": 4, "y": 0 }, "targets": [ { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A", - "instant": true + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 3, - "type": "table", + "type": "stat", "title": "Glue Jobs Suspended", "datasource": { "type": "prometheus", @@ -141,198 +151,238 @@ data: }, "gridPos": { "h": 4, - "w": 6, - "x": 12, + "w": 4, + "x": 8, "y": 0 }, "targets": [ { - "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", - "refId": "A", - "instant": true + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 4, - "type": "table", - "title": "Glue Jobs Active Runs", + "type": "stat", + "title": "Ariadne Task Errors (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 4, - "w": 6, - "x": 18, + "w": 4, + "x": 12, "y": 0 }, "targets": [ { - "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 5, - "type": "table", - "title": "Glue Jobs Last Success (hours ago)", + "type": "stat", + "title": "Ariadne Task Errors (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 + "h": 4, + "w": 4, + "x": 16, + "y": 0 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "h", + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { "id": 6, - "type": "table", - "title": "Glue Jobs Last Schedule (hours ago)", + "type": "stat", + "title": "Ariadne Task Runs (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 + "h": 4, + "w": 4, + "x": 20, + "y": 0 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "h", + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", "custom": { - "filterable": true + "displayMode": "auto" } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + "textMode": "value" + } }, { - "id": 12, + "id": 7, "type": "timeseries", "title": "Ariadne Task Runs vs Errors (1h)", "datasource": { @@ -343,7 +393,7 @@ data: "h": 6, "w": 24, "x": 0, - "y": 12 + "y": 4 }, "targets": [ { @@ -369,94 +419,68 @@ data: } }, { - "id": 7, - "type": "table", + "id": 8, + "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 10 }, "targets": [ { "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", "refId": "A", + "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "none", - "custom": { - "filterable": true + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 8, - "type": "table", - "title": "Ariadne Schedule Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 18 - }, - "targets": [ - { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, { "id": "sortBy", "options": { @@ -470,93 +494,67 @@ data: }, { "id": 9, - "type": "table", - "title": "Ariadne Access Requests", + "type": "bargauge", + "title": "Ariadne Task Success (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 10 }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", "refId": "A", + "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { "unit": "none", - "custom": { - "filterable": true + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "green", + "value": 10 + } + ] } }, "overrides": [] }, "options": { - "showHeader": true, - "columnFilters": false + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 13, - "type": "table", - "title": "Ariadne Schedule Last Error (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 24 - }, - "targets": [ - { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, { "id": "sortBy", "options": { @@ -570,6 +568,376 @@ data: }, { "id": 10, + "type": "bargauge", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "green", + "value": 24 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 11, + "type": "bargauge", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "targets": [ + { + "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 12, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "targets": [ + { + "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 13, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "targets": [ + { + "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 34 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "legendFormat": "{{status}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -577,10 +945,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 30 + "h": 6, + "w": 4, + "x": 8, + "y": 34 }, "targets": [ { @@ -632,7 +1000,7 @@ data: } }, { - "id": 11, + "id": 16, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -641,9 +1009,9 @@ data: }, "gridPos": { "h": 6, - "w": 18, - "x": 6, - "y": 30 + "w": 12, + "x": 12, + "y": 34 }, "targets": [ { From 5fe70b14710fb08c0b1e352916e6f63016b5ea19 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 11:45:11 -0300 Subject: [PATCH 077/416] grafana: allow email-based oauth user lookup --- services/monitoring/helmrelease.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 304de05c..02bc4821 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -354,6 +354,8 @@ spec: GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(groups, 'admin') && 'Admin' || 'Viewer'" GF_AUTH_GENERIC_OAUTH_USE_PKCE: "true" GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: "false" + GF_AUTH_GENERIC_OAUTH_ALLOW_INSECURE_EMAIL_LOOKUP: "true" + GF_AUTH_GENERIC_OAUTH_EMAIL_ATTRIBUTE_PATH: "email" GF_AUTH_SIGNOUT_REDIRECT_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/logout?redirect_uri=https://metrics.bstein.dev/" grafana.ini: server: From d9630011044cadf3c9582b37647258e36c6b7814 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:08:23 -0300 Subject: [PATCH 078/416] monitoring: add grafana user dedupe job --- .../monitoring/grafana-user-dedupe-job.yaml | 51 +++++++++++++++++++ services/monitoring/kustomization.yaml | 1 + 2 files changed, 52 insertions(+) create mode 100644 services/monitoring/grafana-user-dedupe-job.yaml diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml new file mode 100644 index 00000000..b633a19d --- /dev/null +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -0,0 +1,51 @@ +# services/monitoring/grafana-user-dedupe-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: grafana-user-dedupe + namespace: monitoring +spec: + backoffLimit: 1 + template: + spec: + restartPolicy: Never + containers: + - name: dedupe + image: alpine:3.20 + command: + - /bin/sh + - -c + args: + - | + set -euo pipefail + apk add --no-cache sqlite + db="/var/lib/grafana/grafana.db" + if [ ! -f "$db" ]; then + echo "grafana db not found at $db" + exit 1 + fi + if [ -z "${GRAFANA_DEDUPE_EMAILS}" ]; then + echo "GRAFANA_DEDUPE_EMAILS is required" + exit 1 + fi + for email in $(echo "${GRAFANA_DEDUPE_EMAILS}" | tr ',' ' '); do + ids="$(sqlite3 "$db" "select id from user where email = '${email}';")" + if [ -z "$ids" ]; then + echo "no grafana user found for ${email}" + continue + fi + echo "deleting grafana users with ids: ${ids}" + sqlite3 "$db" "delete from user_auth where user_id in (${ids});" + sqlite3 "$db" "delete from user where id in (${ids});" + done + echo "done" + env: + - name: GRAFANA_DEDUPE_EMAILS + value: brad.stein@gmail.com,brad@bstein.dev + volumeMounts: + - name: grafana-storage + mountPath: /var/lib/grafana + volumes: + - name: grafana-storage + persistentVolumeClaim: + claimName: grafana diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 7d0b01b8..86ab8269 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -24,6 +24,7 @@ resources: - grafana-folders.yaml - helmrelease.yaml - grafana-org-bootstrap.yaml + - grafana-user-dedupe-job.yaml configMapGenerator: - name: postmark-exporter-script From af789c0d0bdb172917c7f206c1ccfb0e28f32639 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:11:28 -0300 Subject: [PATCH 079/416] monitoring: dedupe grafana user via api --- .../monitoring/grafana-user-dedupe-job.yaml | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index b633a19d..833eb707 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,8 +2,17 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe + name: grafana-user-dedupe-api namespace: monitoring + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "monitoring" + vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" + vault.hashicorp.com/agent-inject-template-grafana-env.sh: | + {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} + export GRAFANA_USER="{{ index .Data.data "admin-user" }}" + export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} spec: backoffLimit: 1 template: @@ -18,10 +27,15 @@ spec: args: - | set -euo pipefail - apk add --no-cache sqlite - db="/var/lib/grafana/grafana.db" - if [ ! -f "$db" ]; then - echo "grafana db not found at $db" + apk add --no-cache curl jq + . /vault/secrets/grafana-env.sh + grafana_url="${GRAFANA_URL}" + if [ -z "${grafana_url}" ]; then + echo "GRAFANA_URL is required" + exit 1 + fi + if [ -z "${GRAFANA_USER}" ] || [ -z "${GRAFANA_PASSWORD}" ]; then + echo "Grafana admin credentials missing" exit 1 fi if [ -z "${GRAFANA_DEDUPE_EMAILS}" ]; then @@ -29,23 +43,19 @@ spec: exit 1 fi for email in $(echo "${GRAFANA_DEDUPE_EMAILS}" | tr ',' ' '); do - ids="$(sqlite3 "$db" "select id from user where email = '${email}';")" - if [ -z "$ids" ]; then + user_id="$(curl -sf -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \ + "${grafana_url}/api/users/lookup?loginOrEmail=${email}" | jq -r '.id // empty')" + if [ -z "$user_id" ]; then echo "no grafana user found for ${email}" continue fi - echo "deleting grafana users with ids: ${ids}" - sqlite3 "$db" "delete from user_auth where user_id in (${ids});" - sqlite3 "$db" "delete from user where id in (${ids});" + echo "deleting grafana user ${user_id} (${email})" + curl -sf -X DELETE -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \ + "${grafana_url}/api/admin/users/${user_id}" done echo "done" env: + - name: GRAFANA_URL + value: http://grafana - name: GRAFANA_DEDUPE_EMAILS value: brad.stein@gmail.com,brad@bstein.dev - volumeMounts: - - name: grafana-storage - mountPath: /var/lib/grafana - volumes: - - name: grafana-storage - persistentVolumeClaim: - claimName: grafana From 2f37a4786934c3b631dd2b31d20b8e550c7bf67c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:15:03 -0300 Subject: [PATCH 080/416] monitoring: use python dedupe job --- .../monitoring/grafana-user-dedupe-job.yaml | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index 833eb707..f3a1c261 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api + name: grafana-user-dedupe-api-v2 namespace: monitoring annotations: vault.hashicorp.com/agent-inject: "true" @@ -20,14 +20,13 @@ spec: restartPolicy: Never containers: - name: dedupe - image: alpine:3.20 + image: python:3.12-slim command: - /bin/sh - -c args: - | set -euo pipefail - apk add --no-cache curl jq . /vault/secrets/grafana-env.sh grafana_url="${GRAFANA_URL}" if [ -z "${grafana_url}" ]; then @@ -42,17 +41,41 @@ spec: echo "GRAFANA_DEDUPE_EMAILS is required" exit 1 fi - for email in $(echo "${GRAFANA_DEDUPE_EMAILS}" | tr ',' ' '); do - user_id="$(curl -sf -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \ - "${grafana_url}/api/users/lookup?loginOrEmail=${email}" | jq -r '.id // empty')" - if [ -z "$user_id" ]; then - echo "no grafana user found for ${email}" - continue - fi - echo "deleting grafana user ${user_id} (${email})" - curl -sf -X DELETE -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \ - "${grafana_url}/api/admin/users/${user_id}" - done + python - <<'PY' + import base64 + import json + import os + import urllib.parse + import urllib.request + + grafana_url = os.environ["GRAFANA_URL"].rstrip("/") + user = os.environ["GRAFANA_USER"] + password = os.environ["GRAFANA_PASSWORD"] + emails = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()] + + token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("utf-8") + headers = {"Authorization": f"Basic {token}"} + + def request(method: str, url: str): + req = urllib.request.Request(url, headers=headers, method=method) + with urllib.request.urlopen(req, timeout=10) as resp: + return resp.read() + + for email in emails: + lookup_url = f"{grafana_url}/api/users/lookup?loginOrEmail={urllib.parse.quote(email)}" + try: + payload = json.loads(request("GET", lookup_url)) + except Exception: + print(f"no grafana user found for {email}") + continue + user_id = payload.get("id") + if not user_id: + print(f"no grafana user found for {email}") + continue + print(f"deleting grafana user {user_id} ({email})") + delete_url = f"{grafana_url}/api/admin/users/{user_id}" + request("DELETE", delete_url) + PY echo "done" env: - name: GRAFANA_URL From 10704a22d6e7cb732c391d3ea536e29b0065ba5d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:16:26 -0300 Subject: [PATCH 081/416] monitoring: wire vault sa for dedupe job --- services/monitoring/grafana-user-dedupe-job.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index f3a1c261..631c25d0 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v2 + name: grafana-user-dedupe-api-v3 namespace: monitoring annotations: vault.hashicorp.com/agent-inject: "true" @@ -17,6 +17,8 @@ spec: backoffLimit: 1 template: spec: + serviceAccountName: monitoring-vault-sync + automountServiceAccountToken: true restartPolicy: Never containers: - name: dedupe From e8859e605a6dec278ab82fe4d855ab025d0ba708 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:18:57 -0300 Subject: [PATCH 082/416] monitoring: prepopulate vault for dedupe job --- .../monitoring/grafana-user-dedupe-job.yaml | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index 631c25d0..3eb001b8 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,20 +2,23 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v3 + name: grafana-user-dedupe-api-v4 namespace: monitoring - annotations: - vault.hashicorp.com/agent-inject: "true" - vault.hashicorp.com/role: "monitoring" - vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" - vault.hashicorp.com/agent-inject-template-grafana-env.sh: | - {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} - export GRAFANA_USER="{{ index .Data.data "admin-user" }}" - export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}" - {{ end }} spec: backoffLimit: 1 template: + metadata: + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" + vault.hashicorp.com/role: "monitoring" + vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" + vault.hashicorp.com/agent-inject-template-grafana-env.sh: | + {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} + export GRAFANA_USER="{{ index .Data.data "admin-user" }}" + export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} spec: serviceAccountName: monitoring-vault-sync automountServiceAccountToken: true From d89d441486ff8819de1e264907d27ff67f227428 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:25:53 -0300 Subject: [PATCH 083/416] monitoring: fix grafana user dedupe job --- services/monitoring/grafana-user-dedupe-job.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index 3eb001b8..e56362b9 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v4 + name: grafana-user-dedupe-api-v5 namespace: monitoring spec: backoffLimit: 1 @@ -10,7 +10,6 @@ spec: metadata: annotations: vault.hashicorp.com/agent-inject: "true" - vault.hashicorp.com/agent-pre-populate: "true" vault.hashicorp.com/agent-pre-populate-only: "true" vault.hashicorp.com/role: "monitoring" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" @@ -32,6 +31,16 @@ spec: args: - | set -euo pipefail + for _ in $(seq 1 30); do + if [ -f /vault/secrets/grafana-env.sh ]; then + break + fi + sleep 1 + done + if [ ! -f /vault/secrets/grafana-env.sh ]; then + echo "Vault secret not available" + exit 1 + fi . /vault/secrets/grafana-env.sh grafana_url="${GRAFANA_URL}" if [ -z "${grafana_url}" ]; then From 190caf172949a0dc9d73c11f120f2e9c962bc3a0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:30:08 -0300 Subject: [PATCH 084/416] monitoring: harden grafana user dedupe --- .../monitoring/grafana-user-dedupe-job.yaml | 63 ++++++++++++++----- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index e56362b9..1d1bd090 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v5 + name: grafana-user-dedupe-api-v6 namespace: monitoring spec: backoffLimit: 1 @@ -60,35 +60,66 @@ spec: import json import os import urllib.parse + import urllib.error import urllib.request grafana_url = os.environ["GRAFANA_URL"].rstrip("/") user = os.environ["GRAFANA_USER"] password = os.environ["GRAFANA_PASSWORD"] - emails = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()] + lookups = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()] token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("utf-8") headers = {"Authorization": f"Basic {token}"} def request(method: str, url: str): req = urllib.request.Request(url, headers=headers, method=method) - with urllib.request.urlopen(req, timeout=10) as resp: - return resp.read() - - for email in emails: - lookup_url = f"{grafana_url}/api/users/lookup?loginOrEmail={urllib.parse.quote(email)}" try: - payload = json.loads(request("GET", lookup_url)) - except Exception: - print(f"no grafana user found for {email}") + with urllib.request.urlopen(req, timeout=10) as resp: + return resp.status, resp.read() + except urllib.error.HTTPError as err: + body = err.read() + return err.code, body + + for _ in range(60): + status, _ = request("GET", f"{grafana_url}/api/health") + if status == 200: + break + else: + raise SystemExit("Grafana API did not become ready in time") + + for lookup in lookups: + search_url = f"{grafana_url}/api/users/search?query={urllib.parse.quote(lookup)}" + status, body = request("GET", search_url) + if status != 200: + print(f"search failed for {lookup}: status={status} body={body.decode('utf-8', errors='ignore')}") continue - user_id = payload.get("id") - if not user_id: - print(f"no grafana user found for {email}") + payload = json.loads(body) + users = payload.get("users", []) + matches = [ + user + for user in users + if user.get("email", "").lower() == lookup.lower() + or user.get("login", "").lower() == lookup.lower() + ] + if not matches: + print(f"no grafana user found for {lookup}") continue - print(f"deleting grafana user {user_id} ({email})") - delete_url = f"{grafana_url}/api/admin/users/{user_id}" - request("DELETE", delete_url) + for user in matches: + user_id = user.get("id") + if not user_id: + continue + print(f"deleting grafana user {user_id} ({user.get('email')})") + delete_url = f"{grafana_url}/api/admin/users/{user_id}" + del_status, del_body = request("DELETE", delete_url) + if del_status not in (200, 202, 204): + print( + "delete failed for", + user_id, + "status", + del_status, + "body", + del_body.decode("utf-8", errors="ignore"), + ) PY echo "done" env: From 4699ffbf2c278c2fd4efb3a58fde60b71a20a37f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 12:31:54 -0300 Subject: [PATCH 085/416] monitoring: reschedule grafana user dedupe --- .../monitoring/grafana-user-dedupe-job.yaml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml index 1d1bd090..8ab1a665 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/grafana-user-dedupe-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: grafana-user-dedupe-api-v6 + name: grafana-user-dedupe-api-v7 namespace: monitoring spec: backoffLimit: 1 @@ -22,6 +22,20 @@ spec: serviceAccountName: monitoring-vault-sync automountServiceAccountToken: true restartPolicy: Never + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] containers: - name: dedupe image: python:3.12-slim @@ -124,6 +138,6 @@ spec: echo "done" env: - name: GRAFANA_URL - value: http://grafana + value: http://grafana.monitoring.svc.cluster.local - name: GRAFANA_DEDUPE_EMAILS value: brad.stein@gmail.com,brad@bstein.dev From fc87432fdfbd11e73fa68bb0d212a8b2b0a405a7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 13:37:36 -0300 Subject: [PATCH 086/416] monitoring: refresh jobs dashboards --- scripts/dashboards_render_atlas.py | 330 ++++-- services/maintenance/ariadne-deployment.yaml | 8 +- .../{atlas-testing.json => atlas-jobs.json} | 1034 +++++++++------- .../monitoring/dashboards/atlas-overview.json | 284 ++++- ...sting.yaml => grafana-dashboard-jobs.yaml} | 1040 ++++++++++------- .../grafana-dashboard-overview.yaml | 284 ++++- services/monitoring/helmrelease.yaml | 6 +- services/monitoring/kustomization.yaml | 2 +- 8 files changed, 1946 insertions(+), 1042 deletions(-) rename services/monitoring/dashboards/{atlas-testing.json => atlas-jobs.json} (84%) rename services/monitoring/{grafana-dashboard-testing.yaml => grafana-dashboard-jobs.yaml} (84%) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 6eaafb46..1235a0aa 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -337,16 +337,39 @@ GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' +ARIADNE_TEST_SUCCESS_RATE = ( + "100 * " + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[1h])) ' + "/ clamp_min(" + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[1h])), 1)' +) +ARIADNE_TEST_FAILURES_24H = ( + 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' +) +ONEOFF_JOB_OWNER = ( + 'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")' +) +ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})' +ONEOFF_JOB_POD_AGE_HOURS = ( + '((time() - kube_pod_start_time{pod!=""}) / 3600) ' + f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} ' + '* on(namespace,pod) group_left(phase) ' + 'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})' +) GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -798,6 +821,15 @@ def build_overview(): {"color": "red", "value": 3}, ], } + age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 24}, + {"color": "red", "value": 48}, + ], + } row1_stats = [ { @@ -1000,7 +1032,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 2, "w": 5, "x": 0, "y": 8}, + {"h": 3, "w": 5, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -1011,7 +1043,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 2, "w": 5, "x": 10, "y": 8}, + "gridPos": {"h": 3, "w": 5, "x": 10, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1057,7 +1089,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 2, "w": 5, "x": 5, "y": 8}, + {"h": 3, "w": 5, "x": 5, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1069,7 +1101,7 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 2, "w": 5, "x": 15, "y": 8}, + {"h": 3, "w": 5, "x": 15, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, @@ -1089,13 +1121,76 @@ def build_overview(): panel_id, title, expr, - {"h": 6, "w": 6, "x": 6 * idx, "y": 10}, + {"h": 5, "w": 6, "x": 6 * idx, "y": 11}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), ) ) + panels.append( + bargauge_panel( + 40, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 6, "w": 4, "x": 0, "y": 16}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=8, + ) + ) + panels.append( + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": PROM_DS, + "gridPos": {"h": 6, "w": 8, "x": 4, "y": 16}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + timeseries_panel( + 42, + "Ariadne Test Success Rate", + ARIADNE_TEST_SUCCESS_RATE, + {"h": 6, "w": 8, "x": 12, "y": 16}, + unit="percent", + legend=None, + legend_display="list", + ) + ) + panels.append( + bargauge_panel( + 43, + "Tests with Failures (24h)", + ARIADNE_TEST_FAILURES_24H, + {"h": 6, "w": 4, "x": 20, "y": 16}, + unit="none", + instant=True, + legend="{{result}}", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 5}, + {"color": "red", "value": 10}, + ], + }, + ) + ) + cpu_scope = "$namespace_scope_cpu" gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" @@ -1105,7 +1200,7 @@ def build_overview(): 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), - {"h": 9, "w": 8, "x": 0, "y": 16}, + {"h": 9, "w": 8, "x": 0, "y": 22}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1115,7 +1210,7 @@ def build_overview(): 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), - {"h": 9, "w": 8, "x": 8, "y": 16}, + {"h": 9, "w": 8, "x": 8, "y": 22}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1125,7 +1220,7 @@ def build_overview(): 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), - {"h": 9, "w": 8, "x": 16, "y": 16}, + {"h": 9, "w": 8, "x": 16, "y": 22}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1137,7 +1232,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 32}, + {"h": 12, "w": 12, "x": 0, "y": 38}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1151,7 +1246,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 32}, + {"h": 12, "w": 12, "x": 12, "y": 38}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1166,7 +1261,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 44}, + {"h": 10, "w": 12, "x": 0, "y": 50}, unit="percent", legend="{{node}}", legend_display="table", @@ -1178,7 +1273,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 44}, + {"h": 10, "w": 12, "x": 12, "y": 50}, unit="percent", legend="{{node}}", legend_display="table", @@ -1191,7 +1286,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 54}, + {"h": 10, "w": 12, "x": 0, "y": 60}, ) ) panels.append( @@ -1199,7 +1294,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 54}, + {"h": 10, "w": 12, "x": 12, "y": 60}, unit="none", limit=12, decimals=0, @@ -1221,7 +1316,7 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 25}, + {"h": 7, "w": 8, "x": 0, "y": 31}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -1234,7 +1329,7 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 25}, + {"h": 7, "w": 8, "x": 8, "y": 31}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -1247,7 +1342,7 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 25}, + {"h": 7, "w": 8, "x": 16, "y": 31}, unit="Bps", legend="Internal traffic", legend_display="list", @@ -1261,7 +1356,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 64}, + {"h": 16, "w": 12, "x": 0, "y": 70}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1276,7 +1371,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(12, {root_usage_expr()})", - {"h": 16, "w": 12, "x": 12, "y": 64}, + {"h": 16, "w": 12, "x": 12, "y": 70}, unit="percent", thresholds=PERCENT_THRESHOLDS, links=link_to("atlas-storage"), @@ -2171,7 +2266,7 @@ def build_mail_dashboard(): } -def build_testing_dashboard(): +def build_jobs_dashboard(): panels = [] age_thresholds = { "mode": "absolute", @@ -2192,12 +2287,65 @@ def build_testing_dashboard(): ], } + task_error_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5}, + ], + } + + panels.append( + bargauge_panel( + 1, + "Ariadne Task Errors (24h)", + ARIADNE_TASK_ERRORS_24H, + {"h": 7, "w": 6, "x": 0, "y": 0}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + { + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": PROM_DS, + "gridPos": {"h": 7, "w": 12, "x": 6, "y": 0}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + bargauge_panel( + 3, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 7, "w": 6, "x": 18, "y": 0}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=12, + ) + ) panels.append( stat_panel( - 1, + 4, "Glue Jobs Stale (>36h)", GLUE_STALE_COUNT, - {"h": 4, "w": 6, "x": 0, "y": 0}, + {"h": 4, "w": 4, "x": 0, "y": 7}, unit="none", thresholds={ "mode": "absolute", @@ -2212,99 +2360,47 @@ def build_testing_dashboard(): ) panels.append( stat_panel( - 2, + 5, "Glue Jobs Missing Success", GLUE_MISSING_COUNT, - {"h": 4, "w": 4, "x": 4, "y": 0}, - unit="none", - ) - ) - panels.append( - stat_panel( - 3, - "Glue Jobs Suspended", - GLUE_SUSPENDED_COUNT, - {"h": 4, "w": 4, "x": 8, "y": 0}, - unit="none", - ) - ) - panels.append( - stat_panel( - 4, - "Ariadne Task Errors (1h)", - ARIADNE_TASK_ERRORS_1H_TOTAL, - {"h": 4, "w": 4, "x": 12, "y": 0}, - unit="none", - ) - ) - panels.append( - stat_panel( - 5, - "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H_TOTAL, - {"h": 4, "w": 4, "x": 16, "y": 0}, + {"h": 4, "w": 4, "x": 4, "y": 7}, unit="none", ) ) panels.append( stat_panel( 6, - "Ariadne Task Runs (1h)", - ARIADNE_TASK_RUNS_1H_TOTAL, - {"h": 4, "w": 4, "x": 20, "y": 0}, + "Glue Jobs Suspended", + GLUE_SUSPENDED_COUNT, + {"h": 4, "w": 4, "x": 8, "y": 7}, unit="none", ) ) panels.append( - timeseries_panel( + stat_panel( 7, - "Ariadne Task Runs vs Errors (1h)", - ARIADNE_TASK_RUNS_BY_STATUS_1H, - {"h": 6, "w": 24, "x": 0, "y": 4}, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H_TOTAL, + {"h": 4, "w": 4, "x": 12, "y": 7}, unit="none", - legend="{{status}}", - legend_display="table", - legend_placement="right", ) ) panels.append( - bargauge_panel( + stat_panel( 8, "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H, - {"h": 8, "w": 12, "x": 0, "y": 10}, + ARIADNE_TASK_ERRORS_24H_TOTAL, + {"h": 4, "w": 4, "x": 16, "y": 7}, unit="none", - instant=True, - legend="{{task}}", - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 1}, - {"color": "orange", "value": 3}, - {"color": "red", "value": 5}, - ], - }, ) ) panels.append( - bargauge_panel( + stat_panel( 9, - "Ariadne Task Success (24h)", - ARIADNE_TASK_SUCCESS_24H, - {"h": 8, "w": 12, "x": 12, "y": 10}, + "Ariadne Task Runs (1h)", + ARIADNE_TASK_RUNS_1H_TOTAL, + {"h": 4, "w": 4, "x": 20, "y": 7}, unit="none", - instant=True, - legend="{{task}}", - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "red", "value": None}, - {"color": "orange", "value": 1}, - {"color": "yellow", "value": 5}, - {"color": "green", "value": 10}, - ], - }, ) ) panels.append( @@ -2312,7 +2408,7 @@ def build_testing_dashboard(): 10, "Ariadne Schedule Last Error (hours ago)", ARIADNE_SCHEDULE_LAST_ERROR_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 18}, + {"h": 8, "w": 12, "x": 0, "y": 11}, unit="h", instant=True, legend="{{task}}", @@ -2324,7 +2420,7 @@ def build_testing_dashboard(): 11, "Ariadne Schedule Last Success (hours ago)", ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 18}, + {"h": 8, "w": 12, "x": 12, "y": 11}, unit="h", instant=True, legend="{{task}}", @@ -2336,7 +2432,7 @@ def build_testing_dashboard(): 12, "Glue Jobs Last Success (hours ago)", GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 26}, + {"h": 8, "w": 12, "x": 0, "y": 19}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", @@ -2348,7 +2444,7 @@ def build_testing_dashboard(): 13, "Glue Jobs Last Schedule (hours ago)", GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 26}, + {"h": 8, "w": 12, "x": 12, "y": 19}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", @@ -2358,9 +2454,33 @@ def build_testing_dashboard(): panels.append( bargauge_panel( 14, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H, + {"h": 8, "w": 12, "x": 0, "y": 27}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 15, + "Ariadne Task Errors (30d)", + ARIADNE_TASK_ERRORS_30D, + {"h": 8, "w": 12, "x": 12, "y": 27}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 16, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS, - {"h": 6, "w": 8, "x": 0, "y": 34}, + {"h": 6, "w": 8, "x": 0, "y": 35}, unit="none", instant=True, legend="{{status}}", @@ -2368,10 +2488,10 @@ def build_testing_dashboard(): ) panels.append( stat_panel( - 15, + 17, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 6, "w": 4, "x": 8, "y": 34}, + {"h": 6, "w": 4, "x": 8, "y": 35}, unit="percent", decimals=1, instant=True, @@ -2380,10 +2500,10 @@ def build_testing_dashboard(): ) panels.append( table_panel( - 16, + 18, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 12, "x": 12, "y": 34}, + {"h": 6, "w": 12, "x": 12, "y": 35}, unit="none", transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, @@ -2391,8 +2511,8 @@ def build_testing_dashboard(): ) return { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, @@ -2400,7 +2520,7 @@ def build_testing_dashboard(): "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", - "tags": ["atlas", "testing"], + "tags": ["atlas", "jobs", "glue"], } @@ -2497,9 +2617,9 @@ DASHBOARDS = { "builder": build_mail_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml", }, - "atlas-testing": { - "builder": build_testing_dashboard, - "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml", + "atlas-jobs": { + "builder": build_jobs_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml", }, "atlas-gpu": { "builder": build_gpu_dashboard, diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 069f3885..01e940cf 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -270,7 +270,7 @@ spec: - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE value: "30 4 * * *" - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC - value: "*/15 * * * *" + value: "0 * * * *" - name: ARIADNE_SCHEDULE_WGER_USER_SYNC value: "0 5 * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN @@ -286,11 +286,11 @@ spec: - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER value: "30 4 * * 0" - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH - value: "*/15 * * * *" + value: "0 * * * *" - name: ARIADNE_SCHEDULE_VAULT_OIDC - value: "*/15 * * * *" + value: "0 * * * *" - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME - value: "*/1 * * * *" + value: "*/5 * * * *" - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE value: "*/30 * * * *" - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-jobs.json similarity index 84% rename from services/monitoring/dashboards/atlas-testing.json rename to services/monitoring/dashboards/atlas-jobs.json index 420abf26..76e21f01 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -1,416 +1,11 @@ { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "stat", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 0 - }, - "targets": [ - { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 3, - "type": "stat", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 0 - }, - "targets": [ - { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 4, - "type": "stat", - "title": "Ariadne Task Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 5, - "type": "stat", - "title": "Ariadne Task Errors (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 6, - "type": "stat", - "title": "Ariadne Task Runs (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, - "type": "timeseries", - "title": "Ariadne Task Runs vs Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", - "refId": "A", - "legendFormat": "{{status}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 8, "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { @@ -418,10 +13,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 6, "x": 0, - "y": 10 + "y": 0 }, "targets": [ { @@ -484,50 +79,92 @@ ] }, { - "id": 9, - "type": "bargauge", - "title": "Ariadne Task Success (24h)", + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 10 + "x": 6, + "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", + "expr": "sum(increase(ariadne_task_runs_total[1h]))", "refId": "A", - "legendFormat": "{{task}}", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", "instant": true } ], "fieldConfig": { "defaults": { - "unit": "none", + "unit": "h", "min": 0, "max": null, "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, - { - "color": "orange", - "value": 1 - }, { "color": "yellow", - "value": 5 + "value": 6 }, { - "color": "green", - "value": 10 + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 } ] } @@ -554,9 +191,383 @@ ], "order": "desc" } + }, + { + "id": "limit", + "options": { + "limit": 12 + } } ] }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 10, "type": "bargauge", @@ -569,7 +580,7 @@ "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 11 }, "targets": [ { @@ -643,7 +654,7 @@ "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 11 }, "targets": [ { @@ -717,7 +728,7 @@ "h": 8, "w": 12, "x": 0, - "y": 26 + "y": 19 }, "targets": [ { @@ -791,7 +802,7 @@ "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 19 }, "targets": [ { @@ -856,6 +867,154 @@ { "id": 14, "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", "title": "Ariadne Access Requests", "datasource": { "type": "prometheus", @@ -865,7 +1024,7 @@ "h": 6, "w": 8, "x": 0, - "y": 34 + "y": 35 }, "targets": [ { @@ -928,7 +1087,7 @@ ] }, { - "id": 15, + "id": 17, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -939,7 +1098,7 @@ "h": 6, "w": 4, "x": 8, - "y": 34 + "y": 35 }, "targets": [ { @@ -991,7 +1150,7 @@ } }, { - "id": 16, + "id": 18, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -1002,7 +1161,7 @@ "h": 6, "w": 12, "x": 12, - "y": 34 + "y": 35 }, "targets": [ { @@ -1052,6 +1211,7 @@ "style": "dark", "tags": [ "atlas", - "testing" + "jobs", + "glue" ] } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index c5f30d1f..c3ff327d 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -795,7 +795,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 0, "y": 8 @@ -862,7 +862,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 10, "y": 8 @@ -967,7 +967,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 5, "y": 8 @@ -1043,7 +1043,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 15, "y": 8 @@ -1119,10 +1119,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1194,10 +1194,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1269,10 +1269,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1336,10 +1336,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1394,6 +1394,238 @@ } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 4, + "y": 16 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 12, + "y": 16 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 16 + }, + "targets": [ + { + "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 11, "type": "piechart", @@ -1406,7 +1638,7 @@ "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 22 }, "targets": [ { @@ -1475,7 +1707,7 @@ "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 22 }, "targets": [ { @@ -1544,7 +1776,7 @@ "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 22 }, "targets": [ { @@ -1613,7 +1845,7 @@ "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 38 }, "targets": [ { @@ -1660,7 +1892,7 @@ "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 38 }, "targets": [ { @@ -1707,7 +1939,7 @@ "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 50 }, "targets": [ { @@ -1744,7 +1976,7 @@ "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 50 }, "targets": [ { @@ -1781,7 +2013,7 @@ "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 60 }, "targets": [ { @@ -1832,7 +2064,7 @@ "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 60 }, "targets": [ { @@ -1913,7 +2145,7 @@ "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 31 }, "targets": [ { @@ -1957,7 +2189,7 @@ "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 31 }, "targets": [ { @@ -2001,7 +2233,7 @@ "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 31 }, "targets": [ { @@ -2045,7 +2277,7 @@ "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 70 }, "targets": [ { @@ -2093,7 +2325,7 @@ "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 70 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-jobs.yaml similarity index 84% rename from services/monitoring/grafana-dashboard-testing.yaml rename to services/monitoring/grafana-dashboard-jobs.yaml index 52b28367..19e0d4eb 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -1,425 +1,20 @@ -# services/monitoring/grafana-dashboard-testing.yaml +# services/monitoring/grafana-dashboard-jobs.yaml apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboard-testing + name: grafana-dashboard-jobs labels: grafana_dashboard: "1" data: - atlas-testing.json: | + atlas-jobs.json: | { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "stat", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 0 - }, - "targets": [ - { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 3, - "type": "stat", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 0 - }, - "targets": [ - { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 4, - "type": "stat", - "title": "Ariadne Task Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 5, - "type": "stat", - "title": "Ariadne Task Errors (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 6, - "type": "stat", - "title": "Ariadne Task Runs (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, - "type": "timeseries", - "title": "Ariadne Task Runs vs Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))", - "refId": "A", - "legendFormat": "{{status}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 8, "type": "bargauge", "title": "Ariadne Task Errors (24h)", "datasource": { @@ -427,10 +22,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 6, "x": 0, - "y": 10 + "y": 0 }, "targets": [ { @@ -493,50 +88,92 @@ data: ] }, { - "id": 9, - "type": "bargauge", - "title": "Ariadne Task Success (24h)", + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, - "x": 12, - "y": 10 + "x": 6, + "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))", + "expr": "sum(increase(ariadne_task_runs_total[1h]))", "refId": "A", - "legendFormat": "{{task}}", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", "instant": true } ], "fieldConfig": { "defaults": { - "unit": "none", + "unit": "h", "min": 0, "max": null, "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, - { - "color": "orange", - "value": 1 - }, { "color": "yellow", - "value": 5 + "value": 6 }, { - "color": "green", - "value": 10 + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 } ] } @@ -563,9 +200,383 @@ data: ], "order": "desc" } + }, + { + "id": "limit", + "options": { + "limit": 12 + } } ] }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 10, "type": "bargauge", @@ -578,7 +589,7 @@ data: "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 11 }, "targets": [ { @@ -652,7 +663,7 @@ data: "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 11 }, "targets": [ { @@ -726,7 +737,7 @@ data: "h": 8, "w": 12, "x": 0, - "y": 26 + "y": 19 }, "targets": [ { @@ -800,7 +811,7 @@ data: "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 19 }, "targets": [ { @@ -865,6 +876,154 @@ data: { "id": 14, "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "targets": [ + { + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", "title": "Ariadne Access Requests", "datasource": { "type": "prometheus", @@ -874,7 +1033,7 @@ data: "h": 6, "w": 8, "x": 0, - "y": 34 + "y": 35 }, "targets": [ { @@ -937,7 +1096,7 @@ data: ] }, { - "id": 15, + "id": 17, "type": "stat", "title": "Ariadne CI Coverage (%)", "datasource": { @@ -948,7 +1107,7 @@ data: "h": 6, "w": 4, "x": 8, - "y": 34 + "y": 35 }, "targets": [ { @@ -1000,7 +1159,7 @@ data: } }, { - "id": 16, + "id": 18, "type": "table", "title": "Ariadne CI Tests (latest)", "datasource": { @@ -1011,7 +1170,7 @@ data: "h": 6, "w": 12, "x": 12, - "y": 34 + "y": 35 }, "targets": [ { @@ -1061,6 +1220,7 @@ data: "style": "dark", "tags": [ "atlas", - "testing" + "jobs", + "glue" ] } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 8ad75238..45969ccf 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -804,7 +804,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 0, "y": 8 @@ -871,7 +871,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 10, "y": 8 @@ -976,7 +976,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 5, "y": 8 @@ -1052,7 +1052,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, + "h": 3, "w": 5, "x": 15, "y": 8 @@ -1128,10 +1128,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1203,10 +1203,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1278,10 +1278,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1345,10 +1345,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1403,6 +1403,238 @@ data: } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts vs Failures (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 4, + "y": 16 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 12, + "y": 16 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 16 + }, + "targets": [ + { + "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 11, "type": "piechart", @@ -1415,7 +1647,7 @@ data: "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 22 }, "targets": [ { @@ -1484,7 +1716,7 @@ data: "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 22 }, "targets": [ { @@ -1553,7 +1785,7 @@ data: "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 22 }, "targets": [ { @@ -1622,7 +1854,7 @@ data: "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 38 }, "targets": [ { @@ -1669,7 +1901,7 @@ data: "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 38 }, "targets": [ { @@ -1716,7 +1948,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 50 }, "targets": [ { @@ -1753,7 +1985,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 50 }, "targets": [ { @@ -1790,7 +2022,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 60 }, "targets": [ { @@ -1841,7 +2073,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 60 }, "targets": [ { @@ -1922,7 +2154,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 31 }, "targets": [ { @@ -1966,7 +2198,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 31 }, "targets": [ { @@ -2010,7 +2242,7 @@ data: "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 31 }, "targets": [ { @@ -2054,7 +2286,7 @@ data: "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 70 }, "targets": [ { @@ -2102,7 +2334,7 @@ data: "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 70 }, "targets": [ { diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 02bc4821..ac24f8a0 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -471,14 +471,14 @@ spec: editable: true options: path: /var/lib/grafana/dashboards/mail - - name: testing + - name: jobs orgId: 1 folder: Atlas Internal type: file disableDeletion: false editable: true options: - path: /var/lib/grafana/dashboards/testing + path: /var/lib/grafana/dashboards/jobs dashboardsConfigMaps: overview: grafana-dashboard-overview overview-public: grafana-dashboard-overview @@ -488,7 +488,7 @@ spec: gpu: grafana-dashboard-gpu network: grafana-dashboard-network mail: grafana-dashboard-mail - testing: grafana-dashboard-testing + jobs: grafana-dashboard-jobs extraConfigmapMounts: - name: grafana-folders mountPath: /etc/grafana/provisioning/folders diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 86ab8269..59530390 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -14,7 +14,7 @@ resources: - grafana-dashboard-network.yaml - grafana-dashboard-gpu.yaml - grafana-dashboard-mail.yaml - - grafana-dashboard-testing.yaml + - grafana-dashboard-jobs.yaml - dcgm-exporter.yaml - jetson-tegrastats-exporter.yaml - postmark-exporter-service.yaml From a8e646f716d7548362e59bb61015370a004eddef Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 16:40:09 +0000 Subject: [PATCH 087/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 05f3be2f..6cb2acd4 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-16 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-17 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 2fd87aea45446dcb57cd8a1d371bd2574ffccf5b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 14:30:55 -0300 Subject: [PATCH 088/416] monitoring: refine jobs/overview panels --- scripts/dashboards_render_atlas.py | 162 ++++++++++++------ .../monitoring/dashboards/atlas-jobs.json | 119 ++++++++----- .../monitoring/dashboards/atlas-nodes.json | 4 +- .../monitoring/dashboards/atlas-overview.json | 135 +++++++++------ .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-jobs.yaml | 119 ++++++++----- .../monitoring/grafana-dashboard-nodes.yaml | 4 +- .../grafana-dashboard-overview.yaml | 135 +++++++++------ .../monitoring/grafana-dashboard-pods.yaml | 2 +- 9 files changed, 446 insertions(+), 236 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 1235a0aa..3d581c70 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -70,6 +70,7 @@ WORKER_NODES = [ "titan-13", "titan-14", "titan-15", + "titan-16", "titan-17", "titan-18", "titan-19", @@ -333,9 +334,10 @@ GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})" GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)" GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})" -GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" -GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" -GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" +GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)" +GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)" +GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)" +ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))' ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))' @@ -344,10 +346,19 @@ ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_to ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' -ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))' -ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))' +ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))' +ARIADNE_TASK_WARNINGS_SERIES = ( + 'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)' +) ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" +ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = ( + "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600" +) +ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = ( + "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" +) ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' @@ -370,6 +381,8 @@ ONEOFF_JOB_POD_AGE_HOURS = ( '* on(namespace,pod) group_left(phase) ' 'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})' ) +GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600" +GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -1032,7 +1045,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 3, "w": 5, "x": 0, "y": 8}, + {"h": 3, "w": 6, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -1043,7 +1056,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 3, "w": 5, "x": 10, "y": 8}, + "gridPos": {"h": 3, "w": 6, "x": 12, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1089,7 +1102,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 3, "w": 5, "x": 5, "y": 8}, + {"h": 3, "w": 6, "x": 6, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1101,7 +1114,7 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 3, "w": 5, "x": 15, "y": 8}, + {"h": 3, "w": 6, "x": 18, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, @@ -1121,7 +1134,7 @@ def build_overview(): panel_id, title, expr, - {"h": 5, "w": 6, "x": 6 * idx, "y": 11}, + {"h": 3, "w": 6, "x": 6 * idx, "y": 11}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), @@ -1133,26 +1146,44 @@ def build_overview(): 40, "One-off Job Pods (age hours)", ONEOFF_JOB_POD_AGE_HOURS, - {"h": 6, "w": 4, "x": 0, "y": 16}, + {"h": 6, "w": 6, "x": 0, "y": 14}, unit="h", instant=True, legend="{{namespace}}/{{pod}}", thresholds=age_thresholds, limit=8, + decimals=2, ) ) panels.append( { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": PROM_DS, - "gridPos": {"h": 6, "w": 8, "x": 4, "y": 16}, + "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14}, "targets": [ - {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, ], - "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Warnings"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, "options": { "legend": {"displayMode": "table", "placement": "right"}, "tooltip": {"mode": "multi"}, @@ -1164,7 +1195,7 @@ def build_overview(): 42, "Ariadne Test Success Rate", ARIADNE_TEST_SUCCESS_RATE, - {"h": 6, "w": 8, "x": 12, "y": 16}, + {"h": 6, "w": 6, "x": 12, "y": 14}, unit="percent", legend=None, legend_display="list", @@ -1175,7 +1206,7 @@ def build_overview(): 43, "Tests with Failures (24h)", ARIADNE_TEST_FAILURES_24H, - {"h": 6, "w": 4, "x": 20, "y": 16}, + {"h": 6, "w": 6, "x": 18, "y": 14}, unit="none", instant=True, legend="{{result}}", @@ -1200,7 +1231,7 @@ def build_overview(): 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), - {"h": 9, "w": 8, "x": 0, "y": 22}, + {"h": 9, "w": 8, "x": 0, "y": 20}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1210,7 +1241,7 @@ def build_overview(): 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), - {"h": 9, "w": 8, "x": 8, "y": 22}, + {"h": 9, "w": 8, "x": 8, "y": 20}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1220,7 +1251,7 @@ def build_overview(): 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), - {"h": 9, "w": 8, "x": 16, "y": 22}, + {"h": 9, "w": 8, "x": 16, "y": 20}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1232,7 +1263,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 38}, + {"h": 12, "w": 12, "x": 0, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1246,7 +1277,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 38}, + {"h": 12, "w": 12, "x": 12, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1261,7 +1292,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 50}, + {"h": 10, "w": 12, "x": 0, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1273,7 +1304,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 50}, + {"h": 10, "w": 12, "x": 12, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1286,7 +1317,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 60}, + {"h": 10, "w": 12, "x": 0, "y": 58}, ) ) panels.append( @@ -1294,7 +1325,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 60}, + {"h": 10, "w": 12, "x": 12, "y": 58}, unit="none", limit=12, decimals=0, @@ -1316,7 +1347,7 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 31}, + {"h": 7, "w": 8, "x": 0, "y": 29}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -1329,7 +1360,7 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 31}, + {"h": 7, "w": 8, "x": 8, "y": 29}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -1342,7 +1373,7 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 31}, + {"h": 7, "w": 8, "x": 16, "y": 29}, unit="Bps", legend="Internal traffic", legend_display="list", @@ -1356,7 +1387,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 70}, + {"h": 16, "w": 12, "x": 0, "y": 68}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1371,7 +1402,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(12, {root_usage_expr()})", - {"h": 16, "w": 12, "x": 12, "y": 70}, + {"h": 16, "w": 12, "x": 12, "y": 68}, unit="percent", thresholds=PERCENT_THRESHOLDS, links=link_to("atlas-storage"), @@ -2300,9 +2331,9 @@ def build_jobs_dashboard(): panels.append( bargauge_panel( 1, - "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H, - {"h": 7, "w": 6, "x": 0, "y": 0}, + "Ariadne Task Errors (range)", + ARIADNE_TASK_ERRORS_RANGE, + {"h": 7, "w": 8, "x": 0, "y": 0}, unit="none", instant=True, legend="{{task}}", @@ -2313,14 +2344,31 @@ def build_jobs_dashboard(): { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": PROM_DS, - "gridPos": {"h": 7, "w": 12, "x": 6, "y": 0}, + "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0}, "targets": [ - {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, ], - "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Warnings"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, "options": { "legend": {"displayMode": "table", "placement": "right"}, "tooltip": {"mode": "multi"}, @@ -2332,12 +2380,13 @@ def build_jobs_dashboard(): 3, "One-off Job Pods (age hours)", ONEOFF_JOB_POD_AGE_HOURS, - {"h": 7, "w": 6, "x": 18, "y": 0}, + {"h": 7, "w": 8, "x": 16, "y": 0}, unit="h", instant=True, legend="{{namespace}}/{{pod}}", thresholds=age_thresholds, limit=12, + decimals=2, ) ) panels.append( @@ -2407,48 +2456,53 @@ def build_jobs_dashboard(): bargauge_panel( 10, "Ariadne Schedule Last Error (hours ago)", - ARIADNE_SCHEDULE_LAST_ERROR_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 11}, + ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 17}, unit="h", instant=True, legend="{{task}}", thresholds=recent_error_thresholds, + sort_order="asc", + decimals=2, ) ) panels.append( bargauge_panel( 11, "Ariadne Schedule Last Success (hours ago)", - ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 11}, + ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 17}, unit="h", instant=True, legend="{{task}}", thresholds=age_thresholds, + decimals=2, ) ) panels.append( bargauge_panel( 12, "Glue Jobs Last Success (hours ago)", - GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 19}, + GLUE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 23}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", thresholds=age_thresholds, + decimals=2, ) ) panels.append( bargauge_panel( 13, "Glue Jobs Last Schedule (hours ago)", - GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 19}, + GLUE_LAST_SCHEDULE_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 23}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", thresholds=age_thresholds, + decimals=2, ) ) panels.append( @@ -2456,7 +2510,7 @@ def build_jobs_dashboard(): 14, "Ariadne Task Errors (1h)", ARIADNE_TASK_ERRORS_1H, - {"h": 8, "w": 12, "x": 0, "y": 27}, + {"h": 6, "w": 12, "x": 0, "y": 29}, unit="none", instant=True, legend="{{task}}", @@ -2468,7 +2522,7 @@ def build_jobs_dashboard(): 15, "Ariadne Task Errors (30d)", ARIADNE_TASK_ERRORS_30D, - {"h": 8, "w": 12, "x": 12, "y": 27}, + {"h": 6, "w": 12, "x": 12, "y": 29}, unit="none", instant=True, legend="{{task}}", @@ -2480,7 +2534,7 @@ def build_jobs_dashboard(): 16, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS, - {"h": 6, "w": 8, "x": 0, "y": 35}, + {"h": 6, "w": 8, "x": 0, "y": 11}, unit="none", instant=True, legend="{{status}}", @@ -2491,7 +2545,7 @@ def build_jobs_dashboard(): 17, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 6, "w": 4, "x": 8, "y": 35}, + {"h": 6, "w": 4, "x": 8, "y": 11}, unit="percent", decimals=1, instant=True, @@ -2503,7 +2557,7 @@ def build_jobs_dashboard(): 18, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 12, "x": 12, "y": 35}, + {"h": 6, "w": 12, "x": 12, "y": 11}, unit="none", transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 76e21f01..c70e9c0f 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -7,20 +7,20 @@ { "id": 1, "type": "bargauge", - "title": "Ariadne Task Errors (24h)", + "title": "Ariadne Task Errors (range)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -81,26 +81,31 @@ { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 12, - "x": 6, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -108,7 +113,38 @@ "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -130,8 +166,8 @@ }, "gridPos": { "h": 7, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 0 }, "targets": [ @@ -167,7 +203,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -216,7 +253,7 @@ }, "targets": [ { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", "refId": "A" } ], @@ -284,7 +321,7 @@ }, "targets": [ { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", "refId": "A" } ], @@ -344,7 +381,7 @@ }, "targets": [ { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", "refId": "A" } ], @@ -577,14 +614,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -615,7 +652,8 @@ "value": 24 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -637,7 +675,7 @@ "fields": [ "Value" ], - "order": "desc" + "order": "asc" } } ] @@ -651,14 +689,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -689,7 +727,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -725,14 +764,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -763,7 +802,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -799,14 +839,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -837,7 +877,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -873,10 +914,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 27 + "y": 29 }, "targets": [ { @@ -947,10 +988,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 27 + "y": 29 }, "targets": [ { @@ -1024,7 +1065,7 @@ "h": 6, "w": 8, "x": 0, - "y": 35 + "y": 11 }, "targets": [ { @@ -1098,7 +1139,7 @@ "h": 6, "w": 4, "x": 8, - "y": 35 + "y": 11 }, "targets": [ { @@ -1161,7 +1202,7 @@ "h": 6, "w": 12, "x": 12, - "y": 35 + "y": 11 }, "targets": [ { diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 2d60042b..ea595792 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -46,7 +46,7 @@ "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index c3ff327d..5acc2a3a 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -449,14 +449,14 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -466,15 +466,15 @@ }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -796,7 +796,7 @@ }, "gridPos": { "h": 3, - "w": 5, + "w": 6, "x": 0, "y": 8 }, @@ -863,8 +863,8 @@ }, "gridPos": { "h": 3, - "w": 5, - "x": 10, + "w": 6, + "x": 12, "y": 8 }, "targets": [ @@ -968,8 +968,8 @@ }, "gridPos": { "h": 3, - "w": 5, - "x": 5, + "w": 6, + "x": 6, "y": 8 }, "targets": [ @@ -1044,8 +1044,8 @@ }, "gridPos": { "h": 3, - "w": 5, - "x": 15, + "w": 6, + "x": 18, "y": 8 }, "targets": [ @@ -1119,7 +1119,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 0, "y": 11 @@ -1194,7 +1194,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 6, "y": 11 @@ -1269,7 +1269,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 12, "y": 11 @@ -1336,7 +1336,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 18, "y": 11 @@ -1404,9 +1404,9 @@ }, "gridPos": { "h": 6, - "w": 4, + "w": 6, "x": 0, - "y": 16 + "y": 14 }, "targets": [ { @@ -1441,7 +1441,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -1477,26 +1478,31 @@ { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 8, - "x": 4, - "y": 16 + "w": 6, + "x": 6, + "y": 14 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -1504,7 +1510,38 @@ "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -1526,9 +1563,9 @@ }, "gridPos": { "h": 6, - "w": 8, + "w": 6, "x": 12, - "y": 16 + "y": 14 }, "targets": [ { @@ -1562,9 +1599,9 @@ }, "gridPos": { "h": 6, - "w": 4, - "x": 20, - "y": 16 + "w": 6, + "x": 18, + "y": 14 }, "targets": [ { @@ -1638,7 +1675,7 @@ "h": 9, "w": 8, "x": 0, - "y": 22 + "y": 20 }, "targets": [ { @@ -1707,7 +1744,7 @@ "h": 9, "w": 8, "x": 8, - "y": 22 + "y": 20 }, "targets": [ { @@ -1776,7 +1813,7 @@ "h": 9, "w": 8, "x": 16, - "y": 22 + "y": 20 }, "targets": [ { @@ -1845,11 +1882,11 @@ "h": 12, "w": 12, "x": 0, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1892,11 +1929,11 @@ "h": 12, "w": 12, "x": 12, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1939,7 +1976,7 @@ "h": 10, "w": 12, "x": 0, - "y": 50 + "y": 48 }, "targets": [ { @@ -1976,7 +2013,7 @@ "h": 10, "w": 12, "x": 12, - "y": 50 + "y": 48 }, "targets": [ { @@ -2013,7 +2050,7 @@ "h": 10, "w": 12, "x": 0, - "y": 60 + "y": 58 }, "targets": [ { @@ -2064,7 +2101,7 @@ "h": 10, "w": 12, "x": 12, - "y": 60 + "y": 58 }, "targets": [ { @@ -2145,7 +2182,7 @@ "h": 7, "w": 8, "x": 0, - "y": 31 + "y": 29 }, "targets": [ { @@ -2189,7 +2226,7 @@ "h": 7, "w": 8, "x": 8, - "y": 31 + "y": 29 }, "targets": [ { @@ -2233,7 +2270,7 @@ "h": 7, "w": 8, "x": 16, - "y": 31 + "y": 29 }, "targets": [ { @@ -2277,7 +2314,7 @@ "h": 16, "w": 12, "x": 0, - "y": 70 + "y": 68 }, "targets": [ { @@ -2325,7 +2362,7 @@ "h": 16, "w": 12, "x": 12, - "y": 70 + "y": 68 }, "targets": [ { diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index adab84bb..e36aa1fd 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -520,7 +520,7 @@ }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table" diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 19e0d4eb..36c12520 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -16,20 +16,20 @@ data: { "id": 1, "type": "bargauge", - "title": "Ariadne Task Errors (24h)", + "title": "Ariadne Task Errors (range)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -90,26 +90,31 @@ data: { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 12, - "x": 6, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -117,7 +122,38 @@ data: "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -139,8 +175,8 @@ data: }, "gridPos": { "h": 7, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 0 }, "targets": [ @@ -176,7 +212,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -225,7 +262,7 @@ data: }, "targets": [ { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", "refId": "A" } ], @@ -293,7 +330,7 @@ data: }, "targets": [ { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", "refId": "A" } ], @@ -353,7 +390,7 @@ data: }, "targets": [ { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", "refId": "A" } ], @@ -586,14 +623,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -624,7 +661,8 @@ data: "value": 24 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -646,7 +684,7 @@ data: "fields": [ "Value" ], - "order": "desc" + "order": "asc" } } ] @@ -660,14 +698,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -698,7 +736,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -734,14 +773,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -772,7 +811,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -808,14 +848,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -846,7 +886,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -882,10 +923,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 27 + "y": 29 }, "targets": [ { @@ -956,10 +997,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 27 + "y": 29 }, "targets": [ { @@ -1033,7 +1074,7 @@ data: "h": 6, "w": 8, "x": 0, - "y": 35 + "y": 11 }, "targets": [ { @@ -1107,7 +1148,7 @@ data: "h": 6, "w": 4, "x": 8, - "y": 35 + "y": 11 }, "targets": [ { @@ -1170,7 +1211,7 @@ data: "h": 6, "w": 12, "x": 12, - "y": 35 + "y": 11 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index f0f1982d..98123b96 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -55,7 +55,7 @@ data: "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 45969ccf..55196e8f 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -458,14 +458,14 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -475,15 +475,15 @@ data: }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -805,7 +805,7 @@ data: }, "gridPos": { "h": 3, - "w": 5, + "w": 6, "x": 0, "y": 8 }, @@ -872,8 +872,8 @@ data: }, "gridPos": { "h": 3, - "w": 5, - "x": 10, + "w": 6, + "x": 12, "y": 8 }, "targets": [ @@ -977,8 +977,8 @@ data: }, "gridPos": { "h": 3, - "w": 5, - "x": 5, + "w": 6, + "x": 6, "y": 8 }, "targets": [ @@ -1053,8 +1053,8 @@ data: }, "gridPos": { "h": 3, - "w": 5, - "x": 15, + "w": 6, + "x": 18, "y": 8 }, "targets": [ @@ -1128,7 +1128,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 0, "y": 11 @@ -1203,7 +1203,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 6, "y": 11 @@ -1278,7 +1278,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 12, "y": 11 @@ -1345,7 +1345,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 18, "y": 11 @@ -1413,9 +1413,9 @@ data: }, "gridPos": { "h": 6, - "w": 4, + "w": 6, "x": 0, - "y": 16 + "y": 14 }, "targets": [ { @@ -1450,7 +1450,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -1486,26 +1487,31 @@ data: { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 8, - "x": 4, - "y": 16 + "w": 6, + "x": 6, + "y": 14 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -1513,7 +1519,38 @@ data: "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -1535,9 +1572,9 @@ data: }, "gridPos": { "h": 6, - "w": 8, + "w": 6, "x": 12, - "y": 16 + "y": 14 }, "targets": [ { @@ -1571,9 +1608,9 @@ data: }, "gridPos": { "h": 6, - "w": 4, - "x": 20, - "y": 16 + "w": 6, + "x": 18, + "y": 14 }, "targets": [ { @@ -1647,7 +1684,7 @@ data: "h": 9, "w": 8, "x": 0, - "y": 22 + "y": 20 }, "targets": [ { @@ -1716,7 +1753,7 @@ data: "h": 9, "w": 8, "x": 8, - "y": 22 + "y": 20 }, "targets": [ { @@ -1785,7 +1822,7 @@ data: "h": 9, "w": 8, "x": 16, - "y": 22 + "y": 20 }, "targets": [ { @@ -1854,11 +1891,11 @@ data: "h": 12, "w": 12, "x": 0, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1901,11 +1938,11 @@ data: "h": 12, "w": 12, "x": 12, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1948,7 +1985,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 50 + "y": 48 }, "targets": [ { @@ -1985,7 +2022,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 50 + "y": 48 }, "targets": [ { @@ -2022,7 +2059,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 60 + "y": 58 }, "targets": [ { @@ -2073,7 +2110,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 60 + "y": 58 }, "targets": [ { @@ -2154,7 +2191,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 31 + "y": 29 }, "targets": [ { @@ -2198,7 +2235,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 31 + "y": 29 }, "targets": [ { @@ -2242,7 +2279,7 @@ data: "h": 7, "w": 8, "x": 16, - "y": 31 + "y": 29 }, "targets": [ { @@ -2286,7 +2323,7 @@ data: "h": 16, "w": 12, "x": 0, - "y": 70 + "y": 68 }, "targets": [ { @@ -2334,7 +2371,7 @@ data: "h": 16, "w": 12, "x": 12, - "y": 70 + "y": 68 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index f537d4ca..62730238 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -529,7 +529,7 @@ data: }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table" From 9db260e482a8249dd86b54efb71029ee0d8f4d69 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 15:01:02 -0300 Subject: [PATCH 089/416] monitoring: tighten jobs/overview ordering --- scripts/dashboards_render_atlas.py | 18 +++++++++- .../monitoring/dashboards/atlas-jobs.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 36 +++++++++++++++++-- .../monitoring/grafana-dashboard-jobs.yaml | 2 +- .../grafana-dashboard-overview.yaml | 36 +++++++++++++++++-- 5 files changed, 87 insertions(+), 7 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 3d581c70..c3f36550 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -560,6 +560,7 @@ def timeseries_panel( grid, *, unit="none", + max_value=None, legend=None, legend_display="table", legend_placement="bottom", @@ -584,6 +585,8 @@ def timeseries_panel( "tooltip": {"mode": "multi"}, }, } + if max_value is not None: + panel["fieldConfig"]["defaults"]["max"] = max_value if legend: panel["targets"][0]["legendFormat"] = legend if legend_calcs: @@ -742,6 +745,7 @@ def bargauge_panel( thresholds=None, decimals=None, instant=False, + overrides=None, ): """Return a bar gauge panel with label-aware reduction.""" panel = { @@ -786,6 +790,8 @@ def bargauge_panel( }, }, } + if overrides: + panel["fieldConfig"]["overrides"].extend(overrides) if decimals is not None: panel["fieldConfig"]["defaults"]["decimals"] = decimals if links: @@ -1197,6 +1203,7 @@ def build_overview(): ARIADNE_TEST_SUCCESS_RATE, {"h": 6, "w": 6, "x": 12, "y": 14}, unit="percent", + max_value=100, legend=None, legend_display="list", ) @@ -1210,6 +1217,16 @@ def build_overview(): unit="none", instant=True, legend="{{result}}", + overrides=[ + { + "matcher": {"id": "byName", "options": "error"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}], + }, + { + "matcher": {"id": "byName", "options": "failed"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}], + }, + ], thresholds={ "mode": "absolute", "steps": [ @@ -2462,7 +2479,6 @@ def build_jobs_dashboard(): instant=True, legend="{{task}}", thresholds=recent_error_thresholds, - sort_order="asc", decimals=2, ) ) diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index c70e9c0f..810b3b35 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -675,7 +675,7 @@ "fields": [ "Value" ], - "order": "asc" + "order": "desc" } } ] diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 5acc2a3a..3feb5311 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1575,7 +1575,8 @@ ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "max": 100 }, "overrides": [] }, @@ -1638,7 +1639,38 @@ ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "displayMode": "gradient", diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 36c12520..279d959f 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -684,7 +684,7 @@ data: "fields": [ "Value" ], - "order": "asc" + "order": "desc" } } ] diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 55196e8f..66b6da0a 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1584,7 +1584,8 @@ data: ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "max": 100 }, "overrides": [] }, @@ -1647,7 +1648,38 @@ data: ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "displayMode": "gradient", From e0308b89fdbaa3eff67389e8ed00e630be8fe325 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 15:12:53 -0300 Subject: [PATCH 090/416] monitoring: enforce sorted job lists --- scripts/dashboards_render_atlas.py | 24 ++++++++------ .../monitoring/dashboards/atlas-jobs.json | 31 ++++++++----------- .../monitoring/dashboards/atlas-overview.json | 21 +++++-------- .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-jobs.yaml | 31 ++++++++----------- .../grafana-dashboard-overview.yaml | 21 +++++-------- .../monitoring/grafana-dashboard-pods.yaml | 2 +- 7 files changed, 58 insertions(+), 74 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index c3f36550..1f284895 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -748,6 +748,12 @@ def bargauge_panel( overrides=None, ): """Return a bar gauge panel with label-aware reduction.""" + cleaned_expr = expr.strip() + if not cleaned_expr.startswith(("sort(", "sort_desc(")): + if sort_order == "desc": + expr = f"sort_desc({expr})" + elif sort_order == "asc": + expr = f"sort({expr})" panel = { "id": panel_id, "type": "bargauge", @@ -1165,21 +1171,20 @@ def build_overview(): { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": PROM_DS, "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14}, "targets": [ {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, - {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, ], "fieldConfig": { "defaults": {"unit": "none"}, "overrides": [ { - "matcher": {"id": "byName", "options": "Warnings"}, + "matcher": {"id": "byName", "options": "Attempts"}, "properties": [ - {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} ], }, { @@ -2361,21 +2366,20 @@ def build_jobs_dashboard(): { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": PROM_DS, "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0}, "targets": [ {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, - {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, ], "fieldConfig": { "defaults": {"unit": "none"}, "overrides": [ { - "matcher": {"id": "byName", "options": "Warnings"}, + "matcher": {"id": "byName", "options": "Attempts"}, "properties": [ - {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} ], }, { diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 810b3b35..37b888d8 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -81,7 +81,7 @@ { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -98,14 +98,9 @@ "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -117,14 +112,14 @@ { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -172,7 +167,7 @@ }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -621,7 +616,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -696,7 +691,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -771,7 +766,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -846,7 +841,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -921,7 +916,7 @@ }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -995,7 +990,7 @@ }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -1069,7 +1064,7 @@ }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sort_desc(ariadne_access_requests_total)", "refId": "A", "legendFormat": "{{status}}", "instant": true diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 3feb5311..78744dac 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1410,7 +1410,7 @@ }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -1478,7 +1478,7 @@ { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1495,14 +1495,9 @@ "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -1514,14 +1509,14 @@ { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -1606,7 +1601,7 @@ }, "targets": [ { - "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "refId": "A", "legendFormat": "{{result}}", "instant": true @@ -2137,7 +2132,7 @@ }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -2398,7 +2393,7 @@ }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index e36aa1fd..0c8104c9 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -439,7 +439,7 @@ }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 279d959f..b16c9cbb 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -90,7 +90,7 @@ data: { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -107,14 +107,9 @@ data: "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -126,14 +121,14 @@ data: { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -181,7 +176,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -630,7 +625,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -705,7 +700,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -780,7 +775,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -855,7 +850,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -930,7 +925,7 @@ data: }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -1004,7 +999,7 @@ data: }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -1078,7 +1073,7 @@ data: }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sort_desc(ariadne_access_requests_total)", "refId": "A", "legendFormat": "{{status}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 66b6da0a..fa19911f 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1419,7 +1419,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -1487,7 +1487,7 @@ data: { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1504,14 +1504,9 @@ data: "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -1523,14 +1518,14 @@ data: { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -1615,7 +1610,7 @@ data: }, "targets": [ { - "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "refId": "A", "legendFormat": "{{result}}", "instant": true @@ -2146,7 +2141,7 @@ data: }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -2407,7 +2402,7 @@ data: }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 62730238..1461eac6 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -448,7 +448,7 @@ data: }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true From 03ad3374e1f2dd77c77f2174a14a1864fff7f172 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 20:04:15 +0000 Subject: [PATCH 091/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 78f5e685..e43f30ed 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 42ac893378a03b2cba453d04b7894f9ce07fe411 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 20:05:15 +0000 Subject: [PATCH 092/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e43f30ed..ee57a11e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 9a5421f5f956b4f3ed9e5ab510e794d979b738e1 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 20:33:18 +0000 Subject: [PATCH 093/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ee57a11e..60180873 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From b748f6de2bb40c73cd4d16c266e9a1a13113c2b9 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 20:34:18 +0000 Subject: [PATCH 094/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 60180873..87cb6350 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 88e834cbe82db5006fa714335036c3a02d48beaf Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:05:29 +0000 Subject: [PATCH 095/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 87cb6350..9d4896b3 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From ea9a59d02daa14fc1edef4d321cdc0447f62cc3e Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:07:29 +0000 Subject: [PATCH 096/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9d4896b3..8ba3cb03 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 922510ec4a97aeb198c9f5773f7d9836d437beb5 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:23:44 +0000 Subject: [PATCH 097/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 6cb2acd4..9b78f342 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-17 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-18 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 89b476eac3315914ec026d672b42dbf357b5b469 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:30:31 +0000 Subject: [PATCH 098/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 8ba3cb03..36decfa0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 089548e2e3f1d64c3f0a96cf27e4865f05158a4a Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:32:31 +0000 Subject: [PATCH 099/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 36decfa0..9aa6d820 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From ea3666f2c3a067d0d75c2a6005e0f818d6692ea6 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:52:46 +0000 Subject: [PATCH 100/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 9b78f342..6c5ff2ec 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-18 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-19 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 9261415175c10b00f712918908af9b38d0a6b9ea Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:55:34 +0000 Subject: [PATCH 101/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9aa6d820..52341a72 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From dbe54c795f651cb3344e531b0d605e52a3f74369 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 22:56:34 +0000 Subject: [PATCH 102/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 52341a72..e133abec 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 78300e028ee2b32a45557af035dcc764d2cd5e7d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 23:24:37 +0000 Subject: [PATCH 103/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e133abec..7e381abc 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From d033e4a0e310ac8889d7f0d17414aeea4052b2fc Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 23:24:40 +0000 Subject: [PATCH 104/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7e381abc..58688918 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 7d399fd01bb15441cc996e7967ac9eb14df56f7a Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 23:47:39 +0000 Subject: [PATCH 105/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 58688918..6f195140 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 0a6fa06fca6300933fb5bedd08f01d1385c8302e Mon Sep 17 00:00:00 2001 From: flux-bot Date: Wed, 21 Jan 2026 23:48:39 +0000 Subject: [PATCH 106/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 6f195140..94ccbced 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From fb66469e576c7ba7426aff5af3e9c7a170120579 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 00:16:42 +0000 Subject: [PATCH 107/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 94ccbced..e0137923 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 1cc241a8ea6d749cca2761b67bd9ec6cac6a6dc4 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 00:17:42 +0000 Subject: [PATCH 108/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e0137923..d4a8429f 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From f8aa085326d979317cf01631a8ac8aa336a5c18b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 00:48:58 +0000 Subject: [PATCH 109/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 6c5ff2ec..84759a4d 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-19 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-20 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 7e3404b53853e16e0c0b098029dc005735c1c607 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 00:59:59 +0000 Subject: [PATCH 110/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 84759a4d..1f1c7316 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -25,7 +25,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-20 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 58c2af9e853c69cbaa131a16c3d91edb4954f6c4 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 05:37:20 +0000 Subject: [PATCH 111/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d4a8429f..db933330 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From c78b156cf5a68a725b0a50818e004942336036de Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 05:38:20 +0000 Subject: [PATCH 112/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index db933330..8e945e01 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 01e3b85321c1ab506066c47ab51fb23bd8682506 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 05:40:21 +0000 Subject: [PATCH 113/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 8e945e01..bf79e8bb 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From c4ad908edf0e6e393ea570988b530f7af42820a2 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 05:41:20 +0000 Subject: [PATCH 114/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index bf79e8bb..192ad7e3 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From ee4af80e15c11573a3bf22a5a15843a25cf96045 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 03:15:19 -0300 Subject: [PATCH 115/416] jenkins: use shared harbor creds when present --- services/jenkins/deployment.yaml | 6 ++++++ services/vault/scripts/vault_k8s_auth_configure.sh | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 0b62ee09..0dc76afd 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -34,6 +34,12 @@ spec: HARBOR_ROBOT_USERNAME={{ .Data.data.username }} HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} {{ end }} + {{ with secret "kv/data/atlas/shared/harbor-pull" }} + {{- if and .Data.data.username .Data.data.password }} + HARBOR_ROBOT_USERNAME={{ .Data.data.username }} + HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} + {{- end }} + {{ end }} {{ with secret "kv/data/atlas/jenkins/gitea-pat" }} GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index bc03cf4c..00fa567c 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -219,7 +219,7 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \ write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" write_policy_and_role "jenkins" "jenkins" "jenkins" \ - "jenkins/*" "" + "jenkins/* shared/harbor-pull" "" write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ "monitoring/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "logging" "logging" "logging-vault-sync" \ From 096bb329e69a47ffcf11946cd49a92ec9873626d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 04:45:24 -0300 Subject: [PATCH 116/416] jenkins: sync harbor pull secret from vault --- services/jenkins/kustomization.yaml | 3 ++ services/jenkins/secretproviderclass.yaml | 21 ++++++++++++ services/jenkins/vault-serviceaccount.yaml | 6 ++++ services/jenkins/vault-sync-deployment.yaml | 34 +++++++++++++++++++ .../vault/scripts/vault_k8s_auth_configure.sh | 2 +- 5 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 services/jenkins/secretproviderclass.yaml create mode 100644 services/jenkins/vault-serviceaccount.yaml create mode 100644 services/jenkins/vault-sync-deployment.yaml diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index aab859ab..df519685 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -5,11 +5,14 @@ namespace: jenkins resources: - namespace.yaml - serviceaccount.yaml + - vault-serviceaccount.yaml - pvc.yaml - cache-pvc.yaml - plugins-pvc.yaml - configmap-jcasc.yaml - configmap-plugins.yaml + - secretproviderclass.yaml + - vault-sync-deployment.yaml - deployment.yaml - service.yaml - ingress.yaml diff --git a/services/jenkins/secretproviderclass.yaml b/services/jenkins/secretproviderclass.yaml new file mode 100644 index 00000000..a9d9dd50 --- /dev/null +++ b/services/jenkins/secretproviderclass.yaml @@ -0,0 +1,21 @@ +# services/jenkins/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: jenkins-vault + namespace: jenkins +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "jenkins" + objects: | + - objectName: "harbor-pull__dockerconfigjson" + secretPath: "kv/data/atlas/shared/harbor-pull" + secretKey: "dockerconfigjson" + secretObjects: + - secretName: harbor-bstein-robot + type: kubernetes.io/dockerconfigjson + data: + - objectName: harbor-pull__dockerconfigjson + key: .dockerconfigjson diff --git a/services/jenkins/vault-serviceaccount.yaml b/services/jenkins/vault-serviceaccount.yaml new file mode 100644 index 00000000..8d314003 --- /dev/null +++ b/services/jenkins/vault-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/jenkins/vault-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: jenkins-vault-sync + namespace: jenkins diff --git a/services/jenkins/vault-sync-deployment.yaml b/services/jenkins/vault-sync-deployment.yaml new file mode 100644 index 00000000..6de64f9e --- /dev/null +++ b/services/jenkins/vault-sync-deployment.yaml @@ -0,0 +1,34 @@ +# services/jenkins/vault-sync-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: jenkins-vault-sync + namespace: jenkins +spec: + replicas: 1 + selector: + matchLabels: + app: jenkins-vault-sync + template: + metadata: + labels: + app: jenkins-vault-sync + spec: + serviceAccountName: jenkins-vault-sync + containers: + - name: sync + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - "sleep infinity" + volumeMounts: + - name: vault-secrets + mountPath: /vault/secrets + readOnly: true + volumes: + - name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: jenkins-vault diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 00fa567c..a956e0e5 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -218,7 +218,7 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \ "nextcloud/* shared/keycloak-admin shared/postmark-relay" "" write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" -write_policy_and_role "jenkins" "jenkins" "jenkins" \ +write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \ "jenkins/* shared/harbor-pull" "" write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ "monitoring/* shared/postmark-relay shared/harbor-pull" "" From daf8be2d43fb4468ebf9c715760a3c43a7ad16c6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 04:47:50 -0300 Subject: [PATCH 117/416] vault: unsuspend k8s auth config cronjob --- services/vault/k8s-auth-config-cronjob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml index e7cca14e..43da16b4 100644 --- a/services/vault/k8s-auth-config-cronjob.yaml +++ b/services/vault/k8s-auth-config-cronjob.yaml @@ -8,7 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" - suspend: true + suspend: false concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 From abb39d43281b9fdebf723a381f09d12dd9f59b82 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 10:56:27 -0300 Subject: [PATCH 118/416] jenkins: pin vault sync to worker nodes --- services/jenkins/vault-sync-deployment.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/jenkins/vault-sync-deployment.yaml b/services/jenkins/vault-sync-deployment.yaml index 6de64f9e..6abcacef 100644 --- a/services/jenkins/vault-sync-deployment.yaml +++ b/services/jenkins/vault-sync-deployment.yaml @@ -15,6 +15,9 @@ spec: app: jenkins-vault-sync spec: serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" containers: - name: sync image: alpine:3.20 From c985a45113c4b6ca63d2814584b97cae1bacbe6f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 12:41:58 -0300 Subject: [PATCH 119/416] keycloak: allow harbor direct grants --- .../harbor-oidc-secret-ensure-job.yaml | 2 +- .../scripts/harbor_oidc_secret_ensure.sh | 37 ++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/services/keycloak/harbor-oidc-secret-ensure-job.yaml b/services/keycloak/harbor-oidc-secret-ensure-job.yaml index 8eac50d1..87de4632 100644 --- a/services/keycloak/harbor-oidc-secret-ensure-job.yaml +++ b/services/keycloak/harbor-oidc-secret-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: harbor-oidc-secret-ensure-9 + name: harbor-oidc-secret-ensure-10 namespace: sso spec: backoffLimit: 0 diff --git a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh index 7187d343..c70caa28 100755 --- a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh +++ b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh @@ -29,7 +29,7 @@ CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)" if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then - create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}' + create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":true,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}' status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \ -H "Authorization: Bearer ${ACCESS_TOKEN}" \ -H 'Content-Type: application/json' \ @@ -49,6 +49,21 @@ if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then exit 1 fi +CLIENT_CONFIG="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}" || true)" +if [ -n "$CLIENT_CONFIG" ]; then + updated_config="$(echo "$CLIENT_CONFIG" | jq '.directAccessGrantsEnabled=true')" + status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + -H 'Content-Type: application/json' \ + -d "${updated_config}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")" + if [ "$status" != "200" ] && [ "$status" != "204" ]; then + echo "Keycloak client update failed (status ${status})" >&2 + exit 1 + fi +fi + SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ "$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)" if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then @@ -77,6 +92,26 @@ if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2 fi fi +OFFLINE_SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/client-scopes?search=offline_access" | jq -r '.[] | select(.name=="offline_access") | .id' 2>/dev/null | head -n1 || true)" +if [ -n "$OFFLINE_SCOPE_ID" ] && [ "$OFFLINE_SCOPE_ID" != "null" ]; then + if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1 \ + && ! echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1; then + status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")" + if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then + status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")" + if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then + echo "Failed to attach offline_access scope to harbor (status ${status})" >&2 + exit 1 + fi + fi + fi +fi + CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)" if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then From 1fc431af766846ea5ce4e23d5b4907fbfb5b4bf0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 13:26:38 -0300 Subject: [PATCH 120/416] harbor: route v2 ingress to registry --- services/harbor/helmrelease.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml index b0cbdbda..db017873 100644 --- a/services/harbor/helmrelease.yaml +++ b/services/harbor/helmrelease.yaml @@ -378,6 +378,16 @@ spec: subPath: app.conf - name: ca-download mountPath: /etc/core/ca + - target: + kind: Ingress + name: harbor-ingress + patch: |- + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/name + value: harbor-registry + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/port/number + value: 5000 - name: psc mountPath: /etc/core/token volumes: From fc0943b1a64e7237d9e43a1a1a8c16845795a05a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 13:31:12 -0300 Subject: [PATCH 121/416] harbor: fix ingress patch placement --- services/harbor/helmrelease.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml index db017873..16b81a8b 100644 --- a/services/harbor/helmrelease.yaml +++ b/services/harbor/helmrelease.yaml @@ -378,16 +378,6 @@ spec: subPath: app.conf - name: ca-download mountPath: /etc/core/ca - - target: - kind: Ingress - name: harbor-ingress - patch: |- - - op: replace - path: /spec/rules/0/http/paths/2/backend/service/name - value: harbor-registry - - op: replace - path: /spec/rules/0/http/paths/2/backend/service/port/number - value: 5000 - name: psc mountPath: /etc/core/token volumes: @@ -401,6 +391,16 @@ spec: $patch: delete - name: core-writable emptyDir: {} + - target: + kind: Ingress + name: harbor-ingress + patch: |- + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/name + value: harbor-registry + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/port/number + value: 5000 - target: kind: Deployment name: harbor-jobservice From fd8330a8ab789c8a469d41291220510e39879eeb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 13:38:06 -0300 Subject: [PATCH 122/416] flux: temporarily drop harbor health checks --- .../atlas/flux-system/applications/harbor/kustomization.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml index 06baf268..5eec32fc 100644 --- a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml @@ -13,11 +13,6 @@ spec: kind: GitRepository name: flux-system namespace: flux-system - healthChecks: - - apiVersion: helm.toolkit.fluxcd.io/v2 - kind: HelmRelease - name: harbor - namespace: harbor wait: false dependsOn: - name: core From 373e33a178037481fdbe7a2a04972563127cb47e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 14:09:39 -0300 Subject: [PATCH 123/416] ops: pause portal/ariadne and add migrate jobs --- .../bstein-dev-home/backend-deployment.yaml | 16 ++++++- .../chat-ai-gateway-deployment.yaml | 2 +- .../bstein-dev-home/frontend-deployment.yaml | 2 +- services/bstein-dev-home/kustomization.yaml | 1 + .../bstein-dev-home/portal-migrate-job.yaml | 41 ++++++++++++++++++ .../vault-sync-deployment.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 16 ++++++- services/maintenance/ariadne-migrate-job.yaml | 42 +++++++++++++++++++ services/maintenance/kustomization.yaml | 1 + 9 files changed, 118 insertions(+), 5 deletions(-) create mode 100644 services/bstein-dev-home/portal-migrate-job.yaml create mode 100644 services/maintenance/ariadne-migrate-job.yaml diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 074a19d0..100c3ebc 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-backend namespace: bstein-dev-home spec: - replicas: 1 + replicas: 0 revisionHistoryLimit: 3 selector: matchLabels: @@ -99,6 +99,20 @@ spec: value: "" - name: HTTP_CHECK_TIMEOUT_SEC value: "2" + - name: PORTAL_DB_POOL_MIN + value: "0" + - name: PORTAL_DB_POOL_MAX + value: "5" + - name: PORTAL_DB_CONNECT_TIMEOUT_SEC + value: "5" + - name: PORTAL_DB_LOCK_TIMEOUT_SEC + value: "5" + - name: PORTAL_DB_STATEMENT_TIMEOUT_SEC + value: "30" + - name: PORTAL_DB_IDLE_IN_TX_TIMEOUT_SEC + value: "10" + - name: PORTAL_RUN_MIGRATIONS + value: "false" - name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT value: "30" - name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 40d74fe1..3010a9b0 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: chat-ai-gateway namespace: bstein-dev-home spec: - replicas: 1 + replicas: 0 revisionHistoryLimit: 2 selector: matchLabels: diff --git a/services/bstein-dev-home/frontend-deployment.yaml b/services/bstein-dev-home/frontend-deployment.yaml index ef26e73a..bbe5981a 100644 --- a/services/bstein-dev-home/frontend-deployment.yaml +++ b/services/bstein-dev-home/frontend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-frontend namespace: bstein-dev-home spec: - replicas: 1 + replicas: 0 revisionHistoryLimit: 3 selector: matchLabels: diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 192ad7e3..28bbc3a8 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -15,6 +15,7 @@ resources: - frontend-service.yaml - backend-deployment.yaml - backend-service.yaml + - portal-migrate-job.yaml - vaultwarden-cred-sync-cronjob.yaml - portal-onboarding-e2e-test-job.yaml - ingress.yaml diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/portal-migrate-job.yaml new file mode 100644 index 00000000..303a04fc --- /dev/null +++ b/services/bstein-dev-home/portal-migrate-job.yaml @@ -0,0 +1,41 @@ +# services/bstein-dev-home/portal-migrate-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: bstein-dev-home-portal-migrate + namespace: bstein-dev-home +spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + metadata: + labels: + app: bstein-dev-home-portal-migrate + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "bstein-dev-home" + vault.hashicorp.com/agent-inject-secret-portal-env.sh: "kv/data/atlas/portal/atlas-portal-db" + vault.hashicorp.com/agent-inject-template-portal-env.sh: | + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + spec: + serviceAccountName: bstein-dev-home + restartPolicy: Never + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + imagePullSecrets: + - name: harbor-regcred + containers: + - name: migrate + image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-95 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . /vault/secrets/portal-env.sh + && exec python -m atlas_portal.migrate + env: + - name: PORTAL_RUN_MIGRATIONS + value: "true" diff --git a/services/bstein-dev-home/vault-sync-deployment.yaml b/services/bstein-dev-home/vault-sync-deployment.yaml index ad50f1e8..2f2ddbbe 100644 --- a/services/bstein-dev-home/vault-sync-deployment.yaml +++ b/services/bstein-dev-home/vault-sync-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-vault-sync namespace: bstein-dev-home spec: - replicas: 1 + replicas: 0 selector: matchLabels: app: bstein-dev-home-vault-sync diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 01e940cf..e11f8db2 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: ariadne namespace: maintenance spec: - replicas: 1 + replicas: 0 revisionHistoryLimit: 3 selector: matchLabels: @@ -129,6 +129,20 @@ spec: value: https://bstein.dev - name: ARIADNE_LOG_LEVEL value: INFO + - name: ARIADNE_DB_POOL_MIN + value: "0" + - name: ARIADNE_DB_POOL_MAX + value: "5" + - name: ARIADNE_DB_CONNECT_TIMEOUT_SEC + value: "5" + - name: ARIADNE_DB_LOCK_TIMEOUT_SEC + value: "5" + - name: ARIADNE_DB_STATEMENT_TIMEOUT_SEC + value: "30" + - name: ARIADNE_DB_IDLE_IN_TX_TIMEOUT_SEC + value: "10" + - name: ARIADNE_RUN_MIGRATIONS + value: "false" - name: PORTAL_ADMIN_USERS value: bstein - name: PORTAL_ADMIN_GROUPS diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml new file mode 100644 index 00000000..472cf5f5 --- /dev/null +++ b/services/maintenance/ariadne-migrate-job.yaml @@ -0,0 +1,42 @@ +# services/maintenance/ariadne-migrate-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: ariadne-migrate + namespace: maintenance +spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + metadata: + labels: + app: ariadne-migrate + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "maintenance" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" + vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | + {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + spec: + serviceAccountName: ariadne + restartPolicy: Never + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: migrate + image: registry.bstein.dev/bstein/ariadne:0.1.0-0 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . /vault/secrets/ariadne-env.sh + && exec python -m ariadne.migrate + env: + - name: ARIADNE_RUN_MIGRATIONS + value: "true" diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 1f1c7316..c1350ebf 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -14,6 +14,7 @@ resources: - node-nofile-serviceaccount.yaml - pod-cleaner-rbac.yaml - ariadne-deployment.yaml + - ariadne-migrate-job.yaml - ariadne-service.yaml - disable-k3s-traefik-daemonset.yaml - k3s-traefik-cleanup-job.yaml From 307d1bf7a6e5d5875a48e6570922e449cccdd158 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 15:23:23 -0300 Subject: [PATCH 124/416] ops: restore portal/ariadne and add postgres panels --- scripts/dashboards_render_atlas.py | 36 ++++- .../bstein-dev-home/backend-deployment.yaml | 2 +- .../bstein-dev-home/frontend-deployment.yaml | 2 +- .../vault-sync-deployment.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 2 +- .../monitoring/dashboards/atlas-overview.json | 138 +++++++++++++++++- .../grafana-dashboard-overview.yaml | 138 +++++++++++++++++- 7 files changed, 298 insertions(+), 22 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 1f284895..f55896ad 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -371,6 +371,10 @@ ARIADNE_TEST_SUCCESS_RATE = ( ARIADNE_TEST_FAILURES_24H = ( 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' ) +POSTGRES_CONN_USED_PCT = ( + "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)" +) +POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))' ONEOFF_JOB_OWNER = ( 'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")' ) @@ -1057,7 +1061,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 3, "w": 6, "x": 0, "y": 8}, + {"h": 3, "w": 4, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -1068,7 +1072,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 3, "w": 6, "x": 12, "y": 8}, + "gridPos": {"h": 3, "w": 4, "x": 8, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1114,7 +1118,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 3, "w": 6, "x": 6, "y": 8}, + {"h": 3, "w": 4, "x": 4, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1126,13 +1130,37 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 3, "w": 6, "x": 18, "y": 8}, + {"h": 3, "w": 4, "x": 12, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, links=link_to("atlas-mail"), ) ) + panels.append( + gauge_panel( + 34, + "Postgres Connections Used", + POSTGRES_CONN_USED_PCT, + {"h": 3, "w": 4, "x": 16, "y": 8}, + min_value=0, + max_value=100, + thresholds=PERCENT_THRESHOLDS, + ) + ) + panels.append( + stat_panel( + 35, + "Postgres Hottest Connections", + POSTGRES_CONN_HOTTEST, + {"h": 3, "w": 4, "x": 20, "y": 8}, + unit="none", + decimals=0, + text_mode="name_and_value", + legend="{{datname}}", + instant=True, + ) + ) storage_panels = [ (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 100c3ebc..2170396e 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-backend namespace: bstein-dev-home spec: - replicas: 0 + replicas: 1 revisionHistoryLimit: 3 selector: matchLabels: diff --git a/services/bstein-dev-home/frontend-deployment.yaml b/services/bstein-dev-home/frontend-deployment.yaml index bbe5981a..ef26e73a 100644 --- a/services/bstein-dev-home/frontend-deployment.yaml +++ b/services/bstein-dev-home/frontend-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-frontend namespace: bstein-dev-home spec: - replicas: 0 + replicas: 1 revisionHistoryLimit: 3 selector: matchLabels: diff --git a/services/bstein-dev-home/vault-sync-deployment.yaml b/services/bstein-dev-home/vault-sync-deployment.yaml index 2f2ddbbe..ad50f1e8 100644 --- a/services/bstein-dev-home/vault-sync-deployment.yaml +++ b/services/bstein-dev-home/vault-sync-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: bstein-dev-home-vault-sync namespace: bstein-dev-home spec: - replicas: 0 + replicas: 1 selector: matchLabels: app: bstein-dev-home-vault-sync diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index e11f8db2..581947c6 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: ariadne namespace: maintenance spec: - replicas: 0 + replicas: 1 revisionHistoryLimit: 3 selector: matchLabels: diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 78744dac..93a2d803 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -796,7 +796,7 @@ }, "gridPos": { "h": 3, - "w": 6, + "w": 4, "x": 0, "y": 8 }, @@ -863,8 +863,8 @@ }, "gridPos": { "h": 3, - "w": 6, - "x": 12, + "w": 4, + "x": 8, "y": 8 }, "targets": [ @@ -968,8 +968,8 @@ }, "gridPos": { "h": 3, - "w": 6, - "x": 6, + "w": 4, + "x": 4, "y": 8 }, "targets": [ @@ -1044,8 +1044,8 @@ }, "gridPos": { "h": 3, - "w": 6, - "x": 18, + "w": 4, + "x": 12, "y": 8 }, "targets": [ @@ -1110,6 +1110,130 @@ } ] }, + { + "id": 34, + "type": "gauge", + "title": "Postgres Connections Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 8 + }, + "targets": [ + { + "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false + } + }, + { + "id": 35, + "type": "stat", + "title": "Postgres Hottest Connections", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "topk(1, sum by (datname) (pg_stat_activity_count))", + "refId": "A", + "legendFormat": "{{datname}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, { "id": 23, "type": "stat", diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index fa19911f..0e9526ef 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -805,7 +805,7 @@ data: }, "gridPos": { "h": 3, - "w": 6, + "w": 4, "x": 0, "y": 8 }, @@ -872,8 +872,8 @@ data: }, "gridPos": { "h": 3, - "w": 6, - "x": 12, + "w": 4, + "x": 8, "y": 8 }, "targets": [ @@ -977,8 +977,8 @@ data: }, "gridPos": { "h": 3, - "w": 6, - "x": 6, + "w": 4, + "x": 4, "y": 8 }, "targets": [ @@ -1053,8 +1053,8 @@ data: }, "gridPos": { "h": 3, - "w": 6, - "x": 18, + "w": 4, + "x": 12, "y": 8 }, "targets": [ @@ -1119,6 +1119,130 @@ data: } ] }, + { + "id": 34, + "type": "gauge", + "title": "Postgres Connections Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 8 + }, + "targets": [ + { + "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false + } + }, + { + "id": 35, + "type": "stat", + "title": "Postgres Hottest Connections", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "topk(1, sum by (datname) (pg_stat_activity_count))", + "refId": "A", + "legendFormat": "{{datname}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, { "id": 23, "type": "stat", From 3e165975087dcb60d57f34b32b65f95b4e66cd8a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 15:28:26 -0300 Subject: [PATCH 125/416] ops: bump portal and ariadne image tags --- services/bstein-dev-home/kustomization.yaml | 4 ++-- services/maintenance/kustomization.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 28bbc3a8..7c431b29 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,9 +21,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index c1350ebf..992c8890 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 74458dd82e698b15050076cae3ccbaf19b6c4dcb Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:29:01 +0000 Subject: [PATCH 126/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7c431b29..28bbc3a8 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,9 +21,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From dc8238ec16769bce44f3f80980a5279774c34b79 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:29:24 +0000 Subject: [PATCH 127/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 992c8890..c1350ebf 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From c9e972539cb175f1108f0e42d02465b5c8e600ab Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 15:33:08 -0300 Subject: [PATCH 128/416] images: auth image scan and bump tags --- services/bstein-dev-home/image.yaml | 4 ++++ services/bstein-dev-home/kustomization.yaml | 4 ++-- services/maintenance/image.yaml | 2 ++ services/maintenance/kustomization.yaml | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/services/bstein-dev-home/image.yaml b/services/bstein-dev-home/image.yaml index 3b6c7579..eed2736b 100644 --- a/services/bstein-dev-home/image.yaml +++ b/services/bstein-dev-home/image.yaml @@ -7,6 +7,8 @@ metadata: spec: image: registry.bstein.dev/bstein/bstein-dev-home-frontend interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy @@ -28,6 +30,8 @@ metadata: spec: image: registry.bstein.dev/bstein/bstein-dev-home-backend interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 28bbc3a8..7c431b29 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,9 +21,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home diff --git a/services/maintenance/image.yaml b/services/maintenance/image.yaml index 95acbd0b..fd28d902 100644 --- a/services/maintenance/image.yaml +++ b/services/maintenance/image.yaml @@ -7,6 +7,8 @@ metadata: spec: image: registry.bstein.dev/bstein/ariadne interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index c1350ebf..992c8890 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From aa8e20470cdd8129bf814154746bd544f4563be8 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:33:30 +0000 Subject: [PATCH 129/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 992c8890..c1350ebf 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 518b4dba4f9094e76557ba385a1963d9f1dbe32f Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:33:48 +0000 Subject: [PATCH 130/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index c1350ebf..992c8890 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 53ad965a6e9c3f262dc48ddab98ba210aa4ed194 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:34:08 +0000 Subject: [PATCH 131/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7c431b29..28bbc3a8 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,9 +21,9 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 1018e08d549e4b693b5c4a90eccaf29d6a09be4d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:35:15 +0000 Subject: [PATCH 132/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 28bbc3a8..8bfc8a5e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 834c6d275c8cc74287e7809968f7d4964bbfe605 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:35:20 +0000 Subject: [PATCH 133/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 8bfc8a5e..7c431b29 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From e6083868812de98ea806fa43a6cf4d754647477f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 15:39:57 -0300 Subject: [PATCH 134/416] jobs: force recreate migrate jobs --- services/bstein-dev-home/portal-migrate-job.yaml | 2 ++ services/maintenance/ariadne-migrate-job.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/portal-migrate-job.yaml index 303a04fc..a578b8c8 100644 --- a/services/bstein-dev-home/portal-migrate-job.yaml +++ b/services/bstein-dev-home/portal-migrate-job.yaml @@ -4,6 +4,8 @@ kind: Job metadata: name: bstein-dev-home-portal-migrate namespace: bstein-dev-home + annotations: + kustomize.toolkit.fluxcd.io/force: "true" spec: backoffLimit: 1 ttlSecondsAfterFinished: 3600 diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml index 472cf5f5..3528f9be 100644 --- a/services/maintenance/ariadne-migrate-job.yaml +++ b/services/maintenance/ariadne-migrate-job.yaml @@ -4,6 +4,8 @@ kind: Job metadata: name: ariadne-migrate namespace: maintenance + annotations: + kustomize.toolkit.fluxcd.io/force: "true" spec: backoffLimit: 1 ttlSecondsAfterFinished: 3600 From d286950b6d36d09c0c255c0419c6bea7ef2045df Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:47:16 +0000 Subject: [PATCH 135/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7c431b29..3075a664 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From d255483c81eb4d3398317182b22286695b70550e Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 18:48:16 +0000 Subject: [PATCH 136/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 3075a664..c03f2c76 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From c2e34bfaa003ad90734677e07fd90d3319b4a68e Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:00:29 +0000 Subject: [PATCH 137/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index c03f2c76..38b7c40c 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 47e96bf45a8ca4c63e80045d9c18c02981252906 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:00:34 +0000 Subject: [PATCH 138/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 38b7c40c..4eaed54c 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 47f77b3a3ce2f9e060d8f42c55b74d514131afd1 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:02:01 +0000 Subject: [PATCH 139/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 992c8890..2de807e9 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-37 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 6c5a6c030d2231c25c4a31eda8b077bc2bac0abe Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 17:58:53 -0300 Subject: [PATCH 140/416] jenkins: set timezone to America/Chicago --- services/jenkins/deployment.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index 0dc76afd..63f722bd 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -108,7 +108,9 @@ spec: containerPort: 50000 env: - name: JAVA_OPTS - value: "-Xms512m -Xmx2048m" + value: "-Xms512m -Xmx2048m -Duser.timezone=America/Chicago" + - name: TZ + value: "America/Chicago" - name: JENKINS_OPTS value: "--webroot=/var/jenkins_cache/war" - name: JENKINS_SLAVE_AGENT_PORT From 8b8766b0f090545313b3daf4e1eb4acda5eaeaae Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 22 Jan 2026 18:23:17 -0300 Subject: [PATCH 141/416] monitoring: add postgres metrics and update overview --- infrastructure/postgres/service.yaml | 8 ++++ infrastructure/postgres/statefulset.yaml | 17 ++++++++ scripts/dashboards_render_atlas.py | 16 +++---- .../monitoring/dashboards/atlas-overview.json | 42 ++++++++++--------- .../grafana-dashboard-overview.yaml | 42 ++++++++++--------- 5 files changed, 78 insertions(+), 47 deletions(-) diff --git a/infrastructure/postgres/service.yaml b/infrastructure/postgres/service.yaml index 3dcab3c2..b695045f 100644 --- a/infrastructure/postgres/service.yaml +++ b/infrastructure/postgres/service.yaml @@ -4,6 +4,10 @@ kind: Service metadata: name: postgres-service namespace: postgres + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9187" + prometheus.io/path: "/metrics" spec: clusterIP: None ports: @@ -11,5 +15,9 @@ spec: port: 5432 protocol: TCP targetPort: 5432 + - name: metrics + port: 9187 + protocol: TCP + targetPort: 9187 selector: app: postgres diff --git a/infrastructure/postgres/statefulset.yaml b/infrastructure/postgres/statefulset.yaml index e1a19214..2c792486 100644 --- a/infrastructure/postgres/statefulset.yaml +++ b/infrastructure/postgres/statefulset.yaml @@ -58,6 +58,23 @@ spec: - name: vault-secrets mountPath: /mnt/vault readOnly: true + - name: postgres-exporter + image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0 + ports: + - name: metrics + containerPort: 9187 + protocol: TCP + env: + - name: DATA_SOURCE_URI + value: "localhost:5432/postgres?sslmode=disable" + - name: DATA_SOURCE_USER + value: postgres + - name: DATA_SOURCE_PASS_FILE + value: /mnt/vault/postgres_password + volumeMounts: + - name: vault-secrets + mountPath: /mnt/vault + readOnly: true volumes: - name: vault-secrets csi: diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index f55896ad..11479d9d 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -371,8 +371,9 @@ ARIADNE_TEST_SUCCESS_RATE = ( ARIADNE_TEST_FAILURES_24H = ( 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' ) -POSTGRES_CONN_USED_PCT = ( - "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)" +POSTGRES_CONN_USED = ( + 'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") ' + 'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")' ) POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))' ONEOFF_JOB_OWNER = ( @@ -1138,14 +1139,15 @@ def build_overview(): ) ) panels.append( - gauge_panel( + stat_panel( 34, "Postgres Connections Used", - POSTGRES_CONN_USED_PCT, + POSTGRES_CONN_USED, {"h": 3, "w": 4, "x": 16, "y": 8}, - min_value=0, - max_value=100, - thresholds=PERCENT_THRESHOLDS, + decimals=0, + text_mode="name_and_value", + legend="{{conn}}", + instant=True, ) ) panels.append( diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 93a2d803..2d7f3e51 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1112,7 +1112,7 @@ }, { "id": 34, - "type": "gauge", + "type": "stat", "title": "Postgres Connections Used", "datasource": { "type": "prometheus", @@ -1126,39 +1126,43 @@ }, "targets": [ { - "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)", - "refId": "A" + "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")", + "refId": "A", + "legendFormat": "{{conn}}", + "instant": true } ], "fieldConfig": { "defaults": { - "min": 0, - "max": 100, + "color": { + "mode": "thresholds" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 + "color": "green", + "value": 1 } ] - } + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -1166,9 +1170,7 @@ "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "name_and_value" } }, { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 0e9526ef..53361345 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1121,7 +1121,7 @@ data: }, { "id": 34, - "type": "gauge", + "type": "stat", "title": "Postgres Connections Used", "datasource": { "type": "prometheus", @@ -1135,39 +1135,43 @@ data: }, "targets": [ { - "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)", - "refId": "A" + "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")", + "refId": "A", + "legendFormat": "{{conn}}", + "instant": true } ], "fieldConfig": { "defaults": { - "min": 0, - "max": 100, + "color": { + "mode": "thresholds" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 + "color": "green", + "value": 1 } ] - } + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -1175,9 +1179,7 @@ data: "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "name_and_value" } }, { From 4f76e7879c7e426230969360187e002f500eee1b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:41:04 +0000 Subject: [PATCH 142/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 2de807e9..6f5b7dcb 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-37 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-38 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 9833c839da32e31077bb9f0d90dad089f53ceaa4 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:51:32 +0000 Subject: [PATCH 143/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 4eaed54c..cebb191a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From bf5b6a0cc40c1de0bd3bffaa34bd3637c709ca78 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 21:53:32 +0000 Subject: [PATCH 144/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index cebb191a..3ff70ab0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 761f4388ffb2584302e4a80461a0e96b9379cedc Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 22:08:33 +0000 Subject: [PATCH 145/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 3ff70ab0..4e811e0a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 0249122ed1c7303b2134ba330c80bb876ad201c8 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 22:08:37 +0000 Subject: [PATCH 146/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 4e811e0a..7dbfa1c5 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 8024efd9addceef7c8adaf60444a35461c75619d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 22:16:34 +0000 Subject: [PATCH 147/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7dbfa1c5..200ee58d 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From d0b9e6dbd4183401e1a79b3f492036f24e31d7e8 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Thu, 22 Jan 2026 22:16:37 +0000 Subject: [PATCH 148/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 200ee58d..d4f2e028 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From a4b35bc3bc79bb5b6adcb0ddbe5d506fd8671f13 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:07:49 +0000 Subject: [PATCH 149/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d4f2e028..459c63dd 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 9bb9bd9a09659e02d821d49f272b7af56bc35f42 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:08:49 +0000 Subject: [PATCH 150/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 459c63dd..d9fa7c01 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 780c7d450300ff47bdc63d6a92ea4ec33789222d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:12:49 +0000 Subject: [PATCH 151/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d9fa7c01..f651a921 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 130323cae1514086545b00a369b36a7f113685df Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:14:49 +0000 Subject: [PATCH 152/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f651a921..78f1cae8 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 24e837ef729799eb9b0bccc1bc97a2e572ba589b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:32:51 +0000 Subject: [PATCH 153/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 78f1cae8..ae77c9af 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-128 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 0780e5a15531d4df065d8f11a3aa07b705cb43ee Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:42:52 +0000 Subject: [PATCH 154/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ae77c9af..26b85365 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-128 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-129 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From b598e5baa29228ebfa59c0790fca7c8cc788c158 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:51:53 +0000 Subject: [PATCH 155/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 26b85365..48f5bf7d 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-129 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 100add5544c981696a122911dbfc442ad9bf515a Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 01:52:53 +0000 Subject: [PATCH 156/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 48f5bf7d..b5f5319a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 60a705b94c0ad5a4fe6ed8c775f05f8250cb2ffe Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 02:46:57 +0000 Subject: [PATCH 157/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index b5f5319a..d2512be1 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From f54b15cdb975cba4c19f37d1b5699380650bb5fb Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 02:47:58 +0000 Subject: [PATCH 158/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d2512be1..f36c3178 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 0942905a4d24841dfe2f5ef54a2b960af13c5f5f Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:01:59 +0000 Subject: [PATCH 159/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f36c3178..912cd1f2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 29ed8defa87177271fac2490ab615a3cc69b2b91 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:02:59 +0000 Subject: [PATCH 160/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 912cd1f2..8b47e2ed 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 4099563792738054a5626a396553bff344a542c9 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:10:59 +0000 Subject: [PATCH 161/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 8b47e2ed..c83d9f3e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From eb9fe085e5207d4a99a61b7d497eeb243f0df14b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:11:03 +0000 Subject: [PATCH 162/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index c83d9f3e..81931f2a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From e20d3b8fa808500683b1db8c25b0890d264af617 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:38:02 +0000 Subject: [PATCH 163/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 81931f2a..aab9154d 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 0b4f094db47667da202ca796c23a348368da6c9c Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 03:39:02 +0000 Subject: [PATCH 164/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index aab9154d..45a2d815 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 6b83dc4729d66182390928895bb7b19541aab48a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 01:35:15 -0300 Subject: [PATCH 165/416] comms: enable MSC4108 rendezvous in synapse --- services/comms/helmrelease.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/comms/helmrelease.yaml b/services/comms/helmrelease.yaml index 4456348c..e6536fa4 100644 --- a/services/comms/helmrelease.yaml +++ b/services/comms/helmrelease.yaml @@ -138,6 +138,8 @@ spec: auto_join_rooms: - "#othrys:live.bstein.dev" autocreate_auto_join_rooms: true + experimental: + msc4108_enabled: true default_room_version: "11" experimental_features: msc3266_enabled: true From 9ded5a75f06e5120b31486453a4023c4350474fc Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 01:46:03 -0300 Subject: [PATCH 166/416] comms: enable MSC4108 under experimental_features --- services/comms/helmrelease.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/services/comms/helmrelease.yaml b/services/comms/helmrelease.yaml index e6536fa4..eeac49e8 100644 --- a/services/comms/helmrelease.yaml +++ b/services/comms/helmrelease.yaml @@ -138,10 +138,9 @@ spec: auto_join_rooms: - "#othrys:live.bstein.dev" autocreate_auto_join_rooms: true - experimental: - msc4108_enabled: true default_room_version: "11" experimental_features: + msc4108_enabled: true msc3266_enabled: true msc4143_enabled: true msc4222_enabled: true From 8bd58f703443c0808dded7afbdeb3fbd41d63e11 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 02:04:51 -0300 Subject: [PATCH 167/416] comms/keycloak: add mailu email claim --- services/comms/mas-configmap.yaml | 2 +- services/keycloak/realm-settings-job.yaml | 47 +++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/services/comms/mas-configmap.yaml b/services/comms/mas-configmap.yaml index 5e6cfdd1..9d2c11ea 100644 --- a/services/comms/mas-configmap.yaml +++ b/services/comms/mas-configmap.yaml @@ -72,7 +72,7 @@ data: template: "{{ user.name }}" email: action: force - template: "{{ user.email }}" + template: "{{ user.mailu_email }}" policy: data: diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index 6e6589de..e94076c6 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -542,6 +542,53 @@ spec: if status not in (201, 204): raise SystemExit(f"Unexpected mailu email mapper create response: {status}") + mailu_claim_mapper = { + "name": "mailu-email-claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": False, + "config": { + "user.attribute": "mailu_email", + "claim.name": "mailu_email", + "jsonType.label": "String", + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "multivalued": "false", + "aggregate.attrs": "false", + }, + } + status, mappers = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + ) + existing_claim = None + if status == 200 and isinstance(mappers, list): + for item in mappers: + if isinstance(item, dict) and item.get("name") == mailu_claim_mapper["name"]: + existing_claim = item + break + if existing_claim and existing_claim.get("id"): + mailu_claim_mapper["id"] = existing_claim["id"] + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing_claim['id']}", + access_token, + mailu_claim_mapper, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected mailu email claim mapper update response: {status}") + else: + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + mailu_claim_mapper, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected mailu email claim mapper create response: {status}") + # Ensure MFA is on by default for newly-created users. status, required_actions = http_json( "GET", From 4594255cb2a4d0e6edfcfa44b8af689b7d7b0c62 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 02:09:53 -0300 Subject: [PATCH 168/416] keycloak: bump realm settings job --- services/keycloak/realm-settings-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index e94076c6..0de48d1e 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-34 + name: keycloak-realm-settings-35 namespace: sso spec: backoffLimit: 0 From 7ebbcdb914b9359576c7721bb31c52fddc5653eb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 03:11:42 -0300 Subject: [PATCH 169/416] portal: bump migrate job name --- services/bstein-dev-home/portal-migrate-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/portal-migrate-job.yaml index a578b8c8..2cb2a12e 100644 --- a/services/bstein-dev-home/portal-migrate-job.yaml +++ b/services/bstein-dev-home/portal-migrate-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: bstein-dev-home-portal-migrate + name: bstein-dev-home-portal-migrate-36 namespace: bstein-dev-home annotations: kustomize.toolkit.fluxcd.io/force: "true" From 21ee6cee79876d251318c77698cba1e43a985c49 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 06:13:15 +0000 Subject: [PATCH 170/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 45a2d815..41ad3e5b 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -21,7 +21,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From c929e9499d0c9d050b233f2847240f3a78d9a3a7 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 06:14:16 +0000 Subject: [PATCH 171/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 41ad3e5b..ea326a22 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -23,7 +23,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 8da007759bf866b4e67ea336bb61751dc11e4f19 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 03:28:26 -0300 Subject: [PATCH 172/416] bstein-dev-home: separate portal migrations --- .../kustomization.yaml | 16 ++++++++++++++++ .../flux-system/applications/kustomization.yaml | 1 + services/bstein-dev-home/kustomization.yaml | 1 - .../migrations/kustomization.yaml | 6 ++++++ .../{ => migrations}/portal-migrate-job.yaml | 2 +- 5 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml create mode 100644 services/bstein-dev-home/migrations/kustomization.yaml rename services/bstein-dev-home/{ => migrations}/portal-migrate-job.yaml (95%) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml new file mode 100644 index 00000000..f962de0a --- /dev/null +++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml @@ -0,0 +1,16 @@ +# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: bstein-dev-home-migrations + namespace: flux-system +spec: + interval: 10m + path: ./services/bstein-dev-home/migrations + prune: true + sourceRef: + kind: GitRepository + name: flux-system + targetNamespace: bstein-dev-home + wait: false + suspend: true diff --git a/clusters/atlas/flux-system/applications/kustomization.yaml b/clusters/atlas/flux-system/applications/kustomization.yaml index 417a3ec3..10c203d8 100644 --- a/clusters/atlas/flux-system/applications/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/kustomization.yaml @@ -12,6 +12,7 @@ resources: - pegasus/image-automation.yaml - bstein-dev-home/kustomization.yaml - bstein-dev-home/image-automation.yaml + - bstein-dev-home-migrations/kustomization.yaml - harbor/kustomization.yaml - harbor/image-automation.yaml - jellyfin/kustomization.yaml diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ea326a22..e6a744f7 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -15,7 +15,6 @@ resources: - frontend-service.yaml - backend-deployment.yaml - backend-service.yaml - - portal-migrate-job.yaml - vaultwarden-cred-sync-cronjob.yaml - portal-onboarding-e2e-test-job.yaml - ingress.yaml diff --git a/services/bstein-dev-home/migrations/kustomization.yaml b/services/bstein-dev-home/migrations/kustomization.yaml new file mode 100644 index 00000000..067665bc --- /dev/null +++ b/services/bstein-dev-home/migrations/kustomization.yaml @@ -0,0 +1,6 @@ +# services/bstein-dev-home/migrations/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: bstein-dev-home +resources: + - portal-migrate-job.yaml diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/migrations/portal-migrate-job.yaml similarity index 95% rename from services/bstein-dev-home/portal-migrate-job.yaml rename to services/bstein-dev-home/migrations/portal-migrate-job.yaml index 2cb2a12e..9d052546 100644 --- a/services/bstein-dev-home/portal-migrate-job.yaml +++ b/services/bstein-dev-home/migrations/portal-migrate-job.yaml @@ -1,4 +1,4 @@ -# services/bstein-dev-home/portal-migrate-job.yaml +# services/bstein-dev-home/migrations/portal-migrate-job.yaml apiVersion: batch/v1 kind: Job metadata: From 4984147fac48773cbaf34f16a348f349ebaad757 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 06:44:18 +0000 Subject: [PATCH 173/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e6a744f7..f705c4e7 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From c373b953c27aa7e480dc3302dacd19765ed766d1 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 06:45:19 +0000 Subject: [PATCH 174/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f705c4e7..94239e33 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 993702afeedb71ac45793adadb4fc03bd04bc500 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 11:50:55 -0300 Subject: [PATCH 175/416] monitoring: alert on VM outage --- .../vault-csi/secrets-store-csi-driver.yaml | 3 +- .../monitoring/grafana-alerting-config.yaml | 53 +++++++++++++++++++ services/monitoring/helmrelease.yaml | 2 +- 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/infrastructure/vault-csi/secrets-store-csi-driver.yaml b/infrastructure/vault-csi/secrets-store-csi-driver.yaml index 0b249fc9..0004c0d5 100644 --- a/infrastructure/vault-csi/secrets-store-csi-driver.yaml +++ b/infrastructure/vault-csi/secrets-store-csi-driver.yaml @@ -17,4 +17,5 @@ spec: values: syncSecret: enabled: true - enableSecretRotation: false + enableSecretRotation: true + rotationPollInterval: 2m diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index daa1e29a..8713d3db 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -180,6 +180,59 @@ data: summary: "{{ $labels.instance }} CPU >90% for 10m" labels: severity: warning + - orgId: 1 + name: atlas-metrics + folder: Alerts + interval: 1m + rules: + - uid: victoria-metrics-down + title: "VictoriaMetrics unavailable (>30m)" + condition: C + for: "30m" + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: sum(up{job="victoriametrics"}) + legendFormat: victoriametrics + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + reducer: + type: last + type: query + noDataState: Alerting + execErrState: Alerting + annotations: + summary: "VictoriaMetrics is unavailable for >30m" + labels: + severity: critical - orgId: 1 name: maintenance folder: Alerts diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index ac24f8a0..8e225d49 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -342,7 +342,7 @@ spec: GF_SMTP_HOST: "mail.bstein.dev:587" GF_SMTP_FROM: "no-reply-grafana@bstein.dev" GF_SMTP_FROM_NAME: "Atlas Grafana" - GRAFANA_ALERT_EMAILS: "alerts@bstein.dev" + GRAFANA_ALERT_EMAILS: "brad@bstein.dev" GF_SECURITY_ALLOW_EMBEDDING: "true" GF_AUTH_GENERIC_OAUTH_ENABLED: "true" GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak" From af112d9dfa1ae27f2a7402cdf509146ce0b83484 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 14:07:52 -0300 Subject: [PATCH 176/416] finance: allow actual user creation --- services/finance/actual-budget-deployment.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/finance/actual-budget-deployment.yaml b/services/finance/actual-budget-deployment.yaml index 55186b23..637e9ae1 100644 --- a/services/finance/actual-budget-deployment.yaml +++ b/services/finance/actual-budget-deployment.yaml @@ -90,6 +90,8 @@ spec: value: openid - name: ACTUAL_MULTIUSER value: "true" + - name: ACTUAL_USER_CREATION_MODE + value: login - name: ACTUAL_OPENID_DISCOVERY_URL value: https://sso.bstein.dev/realms/atlas - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT @@ -128,6 +130,8 @@ spec: value: openid - name: ACTUAL_MULTIUSER value: "true" + - name: ACTUAL_USER_CREATION_MODE + value: login - name: ACTUAL_OPENID_DISCOVERY_URL value: https://sso.bstein.dev/realms/atlas - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT From c47fc2dcb8b5f83bcf97e4de0aa60bd6fce2ef75 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 19:11:58 +0000 Subject: [PATCH 177/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 94239e33..5d2a1fd3 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From c1e869785f8c5b5fb287cb7ef6f7dea3e729b6ac Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 19:13:56 +0000 Subject: [PATCH 178/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 5d2a1fd3..23381a0d 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From e55a4ee5950ea539953dc6ef71e3cc8d183af2fe Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 19:56:31 +0000 Subject: [PATCH 179/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 6f5b7dcb..617b715b 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-38 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-39 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 199a6fbac068d7b6ed1afa9048a6ff1be2c3835a Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 19:58:00 +0000 Subject: [PATCH 180/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 23381a0d..4007b7d0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 512950094dd09457884821cc64690a1f7ad98677 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:00:01 +0000 Subject: [PATCH 181/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 4007b7d0..e43647c6 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 46ee074929cb8ba8a369a3b794b2d182699857ba Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 17:21:18 -0300 Subject: [PATCH 182/416] maintenance: rotate ariadne migrate job name --- services/maintenance/ariadne-migrate-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml index 3528f9be..b9b1496f 100644 --- a/services/maintenance/ariadne-migrate-job.yaml +++ b/services/maintenance/ariadne-migrate-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: ariadne-migrate + name: ariadne-migrate-2 namespace: maintenance annotations: kustomize.toolkit.fluxcd.io/force: "true" From 9c75e3973a164dac102909d870405b0fb8f3d9da Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:47:05 +0000 Subject: [PATCH 183/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index e43647c6..1642cbe6 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 00d76289f82e73497da0d3cfb6c4e467b41f5568 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:48:05 +0000 Subject: [PATCH 184/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 1642cbe6..9f989fd8 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 15a47f71be1e39385f9c0a2925a7b06d9fec667f Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:50:05 +0000 Subject: [PATCH 185/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9f989fd8..b11cb44b 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 30e7833b6c0540539a04015cd598cc47f39fe067 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 20:51:05 +0000 Subject: [PATCH 186/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index b11cb44b..0039328a 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 2ef49f76b8b93a416a197c8b196e941d17c6027b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 21:27:08 +0000 Subject: [PATCH 187/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 0039328a..a5482c04 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 957b1ef0a59bbbfe1cdc0206028853261dbbdf1b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 21:28:08 +0000 Subject: [PATCH 188/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a5482c04..17186035 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 2ed441ac7458da6261729afd2562639fa2ae61eb Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 21:44:40 +0000 Subject: [PATCH 189/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 617b715b..09636067 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-39 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-43 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 0db28faf32f3d89af629d365c7e024e8ba898ed7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 18:58:14 -0300 Subject: [PATCH 190/416] flux: force apply migrations --- .../applications/bstein-dev-home-migrations/kustomization.yaml | 1 + .../atlas/flux-system/platform/maintenance/kustomization.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml index f962de0a..da61b2d1 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml @@ -8,6 +8,7 @@ spec: interval: 10m path: ./services/bstein-dev-home/migrations prune: true + force: true sourceRef: kind: GitRepository name: flux-system diff --git a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml index fc655a4f..8477ec98 100644 --- a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml @@ -8,6 +8,7 @@ spec: interval: 10m path: ./services/maintenance prune: true + force: true sourceRef: kind: GitRepository name: flux-system From 25b123703decc15b93d6cb457ee0b313f4542585 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:21:43 +0000 Subject: [PATCH 191/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 09636067..18d0008a 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-43 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-44 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 65ba47d6c2c5b7c2c5a7df5b02c5175f1494b696 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:24:13 +0000 Subject: [PATCH 192/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 17186035..487fa644 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From f8d257bff89271fb96534ab4b4441131816c146b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:25:15 +0000 Subject: [PATCH 193/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 487fa644..a3914b53 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 60faa8c74aa55e0e2987541faf876afd7f41d567 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:39:15 +0000 Subject: [PATCH 194/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a3914b53..a58bea72 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From f243fff94f168132fd5474ee4c436bbdb506de44 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 22:40:15 +0000 Subject: [PATCH 195/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a58bea72..ab69f05e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From dc1fccd687ea3af67300850fe8dc05907ad761ab Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:19:18 +0000 Subject: [PATCH 196/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index ab69f05e..2fe7ad22 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 979882c8180eb2b37e456736340ff936be003fea Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:19:21 +0000 Subject: [PATCH 197/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 2fe7ad22..06829f6e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From e232e1868573f4afc0686777ce4d4d2b9b7299d7 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:28:20 +0000 Subject: [PATCH 198/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 06829f6e..655cfaeb 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 547c15748aaaf7e371ae072a33bdef2e4a69b683 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:28:28 +0000 Subject: [PATCH 199/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 655cfaeb..3370bb17 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 362b4a4b5b97ba68cc4b5478b37fb22ed1d3e3ff Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:52:21 +0000 Subject: [PATCH 200/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 3370bb17..9c95b907 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 994bc02f2cf344f16a3bbbc5981c06ab9fefaa6d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Fri, 23 Jan 2026 23:53:21 +0000 Subject: [PATCH 201/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9c95b907..0fa46113 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From cf779aa1968c56b3d4be7f94287af1c41c409393 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 22:30:50 -0300 Subject: [PATCH 202/416] keycloak: add vaultwarden_grandfathered flag --- services/keycloak/realm-settings-job.yaml | 1 + services/maintenance/ariadne-deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index 0de48d1e..74f569b7 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -333,6 +333,7 @@ spec: ensure_group("admin") ensure_group("demo") ensure_group("test") + ensure_group("vaultwarden_grandfathered") planka_group = ensure_group("planka-users") if planka_group and planka_group.get("id"): diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 581947c6..52d10f96 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -150,7 +150,7 @@ spec: - name: ACCOUNT_ALLOWED_GROUPS value: dev,admin - name: ALLOWED_FLAG_GROUPS - value: demo,test + value: demo,test,vaultwarden_grandfathered - name: DEFAULT_USER_GROUPS value: dev - name: MAILU_DOMAIN From 7b1d198f1d074d39a28ef2cf0889f77c06d0defc Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 01:33:29 +0000 Subject: [PATCH 203/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 0fa46113..550a7a8f 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 1fd295f7817b43c9e3accd580c6d3b877bf1640c Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 01:33:33 +0000 Subject: [PATCH 204/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 550a7a8f..efed9a33 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From d3d534d4f870dccf97bb6d0a0a61fc914cdd26fd Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 23 Jan 2026 22:41:20 -0300 Subject: [PATCH 205/416] keycloak: rerun realm settings job --- services/keycloak/realm-settings-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml index 74f569b7..9265ca3e 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/realm-settings-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-35 + name: keycloak-realm-settings-36 namespace: sso spec: backoffLimit: 0 From 6492e64a03ec4c48406ee2e4dd274b276a5f9c95 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 02:05:32 +0000 Subject: [PATCH 206/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index efed9a33..f38bd96c 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From ef1e3955a76a0588fea3b2da5d80a07fd1da85bb Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 02:07:32 +0000 Subject: [PATCH 207/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f38bd96c..276c82f2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 5ed46c0ec8cde7dc9b81c4d5aa063df6f84e7898 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 09:29:39 +0000 Subject: [PATCH 208/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 18d0008a..b3516152 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-44 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-47 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From fbf171d026a2a5b9aa1bb873ac8e5d7ccca1837e Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 10:13:43 +0000 Subject: [PATCH 209/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index b3516152..4e261cbf 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-47 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-48 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 1ca9c54aed4f61a85922f42062c8cd9e680dcb25 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 10:15:15 +0000 Subject: [PATCH 210/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 276c82f2..d7cbaf7d 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From e50ee85fda2b8bdeda094d6d8fde9d39f18c9c74 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 10:16:15 +0000 Subject: [PATCH 211/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d7cbaf7d..cab14d70 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From ae2a031fabeaba35f7eed14bcb8d6cdf5e89bc88 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 14:31:37 +0000 Subject: [PATCH 212/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index cab14d70..fad85342 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From f91d3d97dd4708c9e3c6343f60f95514085e8ac6 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 14:32:37 +0000 Subject: [PATCH 213/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index fad85342..4b21d1e2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From d5b1e77afeb7a91de767559d6744057040d429d3 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 14:44:38 +0000 Subject: [PATCH 214/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 4b21d1e2..60db96aa 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 04c83fe98e858d9f6e4a5ca3e42b86c99a19d86d Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sat, 24 Jan 2026 14:46:38 +0000 Subject: [PATCH 215/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 60db96aa..9d34348b 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From cb6c77bc740defda863c427badebd36f38df46ad Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 24 Jan 2026 14:16:36 -0300 Subject: [PATCH 216/416] vaultwarden: bump to 1.35.2 --- knowledge/catalog/atlas.json | 2 +- knowledge/catalog/atlas.yaml | 2 +- services/comms/knowledge/catalog/atlas.json | 2 +- services/comms/knowledge/catalog/atlas.yaml | 2 +- services/vaultwarden/deployment.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json index 0d97bcd6..18cb6b64 100644 --- a/knowledge/catalog/atlas.json +++ b/knowledge/catalog/atlas.json @@ -998,7 +998,7 @@ "serviceAccountName": null, "nodeSelector": {}, "images": [ - "vaultwarden/server:1.33.2" + "vaultwarden/server:1.35.2" ] } ], diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml index f3e04a84..580a331b 100644 --- a/knowledge/catalog/atlas.yaml +++ b/knowledge/catalog/atlas.yaml @@ -672,7 +672,7 @@ workloads: serviceAccountName: null nodeSelector: {} images: - - vaultwarden/server:1.33.2 + - vaultwarden/server:1.35.2 services: - namespace: ai name: ollama diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json index 0d97bcd6..18cb6b64 100644 --- a/services/comms/knowledge/catalog/atlas.json +++ b/services/comms/knowledge/catalog/atlas.json @@ -998,7 +998,7 @@ "serviceAccountName": null, "nodeSelector": {}, "images": [ - "vaultwarden/server:1.33.2" + "vaultwarden/server:1.35.2" ] } ], diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml index 6529e1a4..67f2fcb2 100644 --- a/services/comms/knowledge/catalog/atlas.yaml +++ b/services/comms/knowledge/catalog/atlas.yaml @@ -672,7 +672,7 @@ workloads: serviceAccountName: null nodeSelector: {} images: - - vaultwarden/server:1.33.2 + - vaultwarden/server:1.35.2 services: - namespace: ai name: ollama diff --git a/services/vaultwarden/deployment.yaml b/services/vaultwarden/deployment.yaml index 2893a924..e1d888a8 100644 --- a/services/vaultwarden/deployment.yaml +++ b/services/vaultwarden/deployment.yaml @@ -39,7 +39,7 @@ spec: node-role.kubernetes.io/worker: "true" containers: - name: vaultwarden - image: vaultwarden/server:1.33.2 + image: vaultwarden/server:1.35.2 command: ["/bin/sh", "-c"] args: - >- From b18d0d40bc9c5115150d1d0f219b323de0431405 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 00:06:26 +0000 Subject: [PATCH 217/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 9d34348b..63eaebf2 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 531e573c004ff06c522ae961359cb5da3ff1e2a2 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 00:07:26 +0000 Subject: [PATCH 218/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 63eaebf2..1511f5c6 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 7fec7d4fd12cd02e12855f63a6020d4b185b490b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 17:39:57 +0000 Subject: [PATCH 219/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 1511f5c6..7ed1b524 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 936d599c6ec5e0ed439ed4709d4f2242376b0b89 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 17:40:57 +0000 Subject: [PATCH 220/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 7ed1b524..0890f593 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 7e004efe65a8d82a4705bb26c63c7c73b26a2323 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 18:04:59 +0000 Subject: [PATCH 221/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 0890f593..c0aff7fa 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 7c4af51287ad206de6e3beaea16053af3cedcdfe Mon Sep 17 00:00:00 2001 From: flux-bot Date: Sun, 25 Jan 2026 18:06:59 +0000 Subject: [PATCH 222/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index c0aff7fa..90c3b8de 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 58267ab522403842a53222b77dbf68aac0b3f5ec Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 25 Jan 2026 15:59:12 -0300 Subject: [PATCH 223/416] comms: route atlasbot to chat gateway --- services/bstein-dev-home/chat-ai-gateway-deployment.yaml | 2 +- services/comms/atlasbot-deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 3010a9b0..40d74fe1 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -5,7 +5,7 @@ metadata: name: chat-ai-gateway namespace: bstein-dev-home spec: - replicas: 0 + replicas: 1 revisionHistoryLimit: 2 selector: matchLabels: diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 46180539..278a008f 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -76,7 +76,7 @@ spec: - name: BOT_USER value: atlasbot - name: OLLAMA_URL - value: https://chat.ai.bstein.dev/ + value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL value: qwen2.5-coder:7b-instruct-q4_0 resources: From 712bba23a1ec5c38b092466c83ecd58dc8239f55 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 25 Jan 2026 16:19:15 -0300 Subject: [PATCH 224/416] ai: restart ollama deployment --- services/ai-llm/deployment.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index fa354408..dfa1bdd1 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -22,6 +22,7 @@ spec: annotations: ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0 ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24) + ai.bstein.dev/restartedAt: "2026-01-25T19:10:00Z" spec: affinity: nodeAffinity: From 08be13fe91e073062a5435af12c3478363e51805 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 00:52:35 -0300 Subject: [PATCH 225/416] comms: normalize atlasbot replies --- services/comms/atlasbot-deployment.yaml | 4 ++- services/comms/scripts/atlasbot/bot.py | 34 ++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 278a008f..c2bc108d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-4 + checksum/atlasbot-configmap: manual-atlasbot-5 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -75,6 +75,8 @@ spec: value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 - name: BOT_USER value: atlasbot + - name: BOT_MENTIONS + value: atlasbot - name: OLLAMA_URL value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e8bd1a83..3da93ba5 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -71,6 +71,8 @@ METRIC_HINT_WORDS = { "latency", } +CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL) + def _tokens(text: str) -> list[str]: toks = [t.lower() for t in TOKEN_RE.findall(text or "")] return [t for t in toks if t not in STOPWORDS and len(t) >= 2] @@ -442,6 +444,35 @@ def vm_cluster_snapshot() -> str: parts.append(pr) return "\n".join(parts).strip() +def _strip_code_fence(text: str) -> str: + cleaned = (text or "").strip() + match = CODE_FENCE_RE.match(cleaned) + if match: + return match.group(1).strip() + return cleaned + +def _normalize_reply(value: Any) -> str: + if isinstance(value, dict): + for key in ("content", "response", "reply", "message"): + if key in value: + return _normalize_reply(value[key]) + for v in value.values(): + if isinstance(v, (str, dict, list)): + return _normalize_reply(v) + return json.dumps(value, ensure_ascii=False) + if isinstance(value, list): + parts = [_normalize_reply(item) for item in value] + return " ".join(p for p in parts if p) + if value is None: + return "" + text = _strip_code_fence(str(value)) + if text.startswith("{") and text.endswith("}"): + try: + return _normalize_reply(json.loads(text)) + except Exception: + return text + return text + # Conversation state. history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript) @@ -511,7 +542,8 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str: r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) with request.urlopen(r, timeout=20) as resp: data = json.loads(resp.read().decode()) - reply = data.get("message") or data.get("response") or data.get("reply") or "I'm here to help." + raw_reply = data.get("message") or data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." history[hist_key].append(f"Atlas: {reply}") return reply except Exception: From e7f8290807cb1dfc0a60bdb037cf6d968e7e1052 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 01:07:49 -0300 Subject: [PATCH 226/416] comms: answer node count queries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 33 ++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c2bc108d..7a258acd 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-5 + checksum/atlasbot-configmap: manual-atlasbot-6 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 3da93ba5..69c1b84b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -444,6 +444,28 @@ def vm_cluster_snapshot() -> str: parts.append(pr) return "\n".join(parts).strip() +def nodes_summary(cluster_name: str) -> str: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return "" + items = data.get("items") or [] + if not isinstance(items, list) or not items: + return "" + total = len(items) + ready = 0 + for node in items: + conditions = node.get("status", {}).get("conditions") or [] + for cond in conditions if isinstance(conditions, list) else []: + if cond.get("type") == "Ready": + if cond.get("status") == "True": + ready += 1 + break + not_ready = max(total - ready, 0) + if not_ready: + return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." + return f"{cluster_name} cluster has {total} nodes, all Ready." + def _strip_code_fence(text: str) -> str: cleaned = (text or "").strip() match = CODE_FENCE_RE.match(cleaned) @@ -526,7 +548,8 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str: "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " "Prefer answering with exact repo paths and Kubernetes resource names. " - "Never include or request secret values." + "Never include or request secret values. " + "Respond in plain sentences; do not return JSON or code fences unless explicitly asked." ) transcript_parts = [system] if context: @@ -601,6 +624,14 @@ def sync_loop(token: str, room_id: str): if not (is_dm or mentioned): continue + lower_body = body.lower() + if re.search(r"\\bhow many nodes\\b|\\bnode count\\b|\\bnumber of nodes\\b", lower_body): + if any(word in lower_body for word in ("cluster", "atlas", "titan")): + summary = nodes_summary("Atlas") + if summary: + send_msg(token, rid, summary) + continue + # Only do live cluster/metrics introspection in DMs. allow_tools = is_dm From 75c8a21b466918ba469b5d27ad374d53de695255 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 01:32:01 -0300 Subject: [PATCH 227/416] comms: fix atlasbot node count matcher --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7a258acd..fe1e9066 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-6 + checksum/atlasbot-configmap: manual-atlasbot-7 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 69c1b84b..b2ac1c9b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -625,12 +625,14 @@ def sync_loop(token: str, room_id: str): continue lower_body = body.lower() - if re.search(r"\\bhow many nodes\\b|\\bnode count\\b|\\bnumber of nodes\\b", lower_body): + if re.search(r"\bhow many nodes\b|\bnode count\b|\bnumber of nodes\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): summary = nodes_summary("Atlas") - if summary: - send_msg(token, rid, summary) + if not summary: + send_msg(token, rid, "I couldn’t reach the cluster API to count nodes. Try again in a moment.") continue + send_msg(token, rid, summary) + continue # Only do live cluster/metrics introspection in DMs. allow_tools = is_dm From 9c3328a030d21c41cbcba5b4957693d224932c9d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 01:35:47 -0300 Subject: [PATCH 228/416] comms: answer node name queries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 29 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index fe1e9066..7aedf4a0 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-7 + checksum/atlasbot-configmap: manual-atlasbot-8 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index b2ac1c9b..6fb6bff0 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -466,6 +466,27 @@ def nodes_summary(cluster_name: str) -> str: return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." return f"{cluster_name} cluster has {total} nodes, all Ready." +def nodes_names_summary(cluster_name: str) -> str: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return "" + items = data.get("items") or [] + if not isinstance(items, list) or not items: + return "" + names = [] + for node in items: + name = (node.get("metadata") or {}).get("name") or "" + if name: + names.append(name) + names = sorted(set(names)) + if not names: + return "" + if len(names) <= 30: + return f"{cluster_name} node names: {', '.join(names)}." + shown = ", ".join(names[:30]) + return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)." + def _strip_code_fence(text: str) -> str: cleaned = (text or "").strip() match = CODE_FENCE_RE.match(cleaned) @@ -633,6 +654,14 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body): + if any(word in lower_body for word in ("cluster", "atlas", "titan")): + names_summary = nodes_names_summary("Atlas") + if not names_summary: + send_msg(token, rid, "I couldn’t reach the cluster API to list node names. Try again in a moment.") + continue + send_msg(token, rid, names_summary) + continue # Only do live cluster/metrics introspection in DMs. allow_tools = is_dm From 87db5b2bd2ed7d6aafe0810e6adfcf803b06e905 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 03:32:17 -0300 Subject: [PATCH 229/416] comms: sync atlas knowledge and use ariadne state --- knowledge/catalog/atlas-summary.json | 8 +- knowledge/catalog/atlas.json | 706 ++++++++++++++++-- knowledge/catalog/atlas.yaml | 494 ++++++++++-- knowledge/diagrams/atlas-http.mmd | 43 +- scripts/knowledge_render_atlas.py | 17 + services/comms/atlasbot-deployment.yaml | 4 +- .../knowledge/catalog/atlas-summary.json | 8 +- services/comms/knowledge/catalog/atlas.json | 706 ++++++++++++++++-- services/comms/knowledge/catalog/atlas.yaml | 496 ++++++++++-- .../comms/knowledge/catalog/runbooks.json | 16 + .../comms/knowledge/diagrams/atlas-http.mmd | 43 +- services/comms/knowledge/metis.md | 26 + .../comms/knowledge/runbooks/comms-verify.md | 30 + services/comms/knowledge/software/metis.md | 73 ++ services/comms/scripts/atlasbot/bot.py | 38 + services/maintenance/ariadne-deployment.yaml | 8 + services/maintenance/ariadne-rbac.yaml | 15 + 17 files changed, 2453 insertions(+), 278 deletions(-) create mode 100644 services/comms/knowledge/metis.md create mode 100644 services/comms/knowledge/runbooks/comms-verify.md create mode 100644 services/comms/knowledge/software/metis.md diff --git a/knowledge/catalog/atlas-summary.json b/knowledge/catalog/atlas-summary.json index fa350516..ea825ce7 100644 --- a/knowledge/catalog/atlas-summary.json +++ b/knowledge/catalog/atlas-summary.json @@ -1,8 +1,8 @@ { "counts": { - "helmrelease_host_hints": 17, - "http_endpoints": 37, - "services": 43, - "workloads": 54 + "helmrelease_host_hints": 19, + "http_endpoints": 45, + "services": 47, + "workloads": 74 } } diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json index 18cb6b64..21ac4073 100644 --- a/knowledge/catalog/atlas.json +++ b/knowledge/catalog/atlas.json @@ -11,6 +11,21 @@ "path": "services/bstein-dev-home", "targetNamespace": "bstein-dev-home" }, + { + "name": "bstein-dev-home-migrations", + "path": "services/bstein-dev-home/migrations", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "cert-manager", + "path": "infrastructure/cert-manager", + "targetNamespace": "cert-manager" + }, + { + "name": "cert-manager-cleanup", + "path": "infrastructure/cert-manager/cleanup", + "targetNamespace": "cert-manager" + }, { "name": "comms", "path": "services/comms", @@ -26,6 +41,11 @@ "path": "services/crypto", "targetNamespace": "crypto" }, + { + "name": "finance", + "path": "services/finance", + "targetNamespace": "finance" + }, { "name": "flux-system", "path": "clusters/atlas/flux-system", @@ -46,6 +66,11 @@ "path": "services/harbor", "targetNamespace": "harbor" }, + { + "name": "health", + "path": "services/health", + "targetNamespace": "health" + }, { "name": "helm", "path": "infrastructure/sources/helm", @@ -71,6 +96,16 @@ "path": "services/logging", "targetNamespace": null }, + { + "name": "longhorn", + "path": "infrastructure/longhorn/core", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-adopt", + "path": "infrastructure/longhorn/adopt", + "targetNamespace": "longhorn-system" + }, { "name": "longhorn-ui", "path": "infrastructure/longhorn/ui-ingress", @@ -161,11 +196,21 @@ "path": "infrastructure/vault-csi", "targetNamespace": "kube-system" }, + { + "name": "vault-injector", + "path": "infrastructure/vault-injector", + "targetNamespace": "vault" + }, { "name": "vaultwarden", "path": "services/vaultwarden", "targetNamespace": "vaultwarden" }, + { + "name": "wallet-monero-temp", + "path": "services/crypto/wallet-monero-temp", + "targetNamespace": "crypto" + }, { "name": "xmr-miner", "path": "services/crypto/xmr-miner", @@ -199,7 +244,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157" ] }, { @@ -215,7 +260,20 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-vault-sync", + "labels": { + "app": "bstein-dev-home-vault-sync" + }, + "serviceAccountName": "bstein-dev-home-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -225,7 +283,7 @@ "labels": { "app": "chat-ai-gateway" }, - "serviceAccountName": null, + "serviceAccountName": "bstein-dev-home", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -249,6 +307,19 @@ "python:3.11-slim" ] }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "comms-vault-sync", + "labels": { + "app": "comms-vault-sync" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "comms", @@ -256,7 +327,7 @@ "labels": { "app": "coturn" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -286,7 +357,7 @@ "labels": { "app": "livekit" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -301,12 +372,12 @@ "labels": { "app": "livekit-token-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, "images": [ - "ghcr.io/element-hq/lk-jwt-service:0.3.0" + "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0" ] }, { @@ -316,7 +387,7 @@ "labels": { "app": "matrix-authentication-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -331,7 +402,7 @@ "labels": { "app.kubernetes.io/name": "matrix-guest-register" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": {}, "images": [ "python:3.11-slim" @@ -365,6 +436,19 @@ "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "crypto-vault-sync", + "labels": { + "app": "crypto-vault-sync" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "crypto", @@ -372,7 +456,7 @@ "labels": { "app": "monero-p2pool" }, - "serviceAccountName": null, + "serviceAccountName": "crypto-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -395,6 +479,53 @@ "registry.bstein.dev/crypto/monerod:0.18.4.1" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "wallet-monero-temp", + "labels": { + "app": "wallet-monero-temp" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "actual-budget", + "labels": { + "app": "actual-budget" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "firefly", + "labels": { + "app": "firefly" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "fireflyiii/core:version-6.4.15" + ] + }, { "kind": "Deployment", "namespace": "flux-system", @@ -516,7 +647,7 @@ "labels": { "app": "gitea" }, - "serviceAccountName": null, + "serviceAccountName": "gitea-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -524,6 +655,36 @@ "gitea/gitea:1.23" ] }, + { + "kind": "Deployment", + "namespace": "harbor", + "name": "harbor-vault-sync", + "labels": { + "app": "harbor-vault-sync" + }, + "serviceAccountName": "harbor-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "health", + "name": "wger", + "labels": { + "app": "wger" + }, + "serviceAccountName": "health-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10", + "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5" + ] + }, { "kind": "Deployment", "namespace": "jellyfin", @@ -531,7 +692,7 @@ "labels": { "app": "jellyfin" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": {}, "images": [ "docker.io/jellyfin/jellyfin:10.11.5" @@ -544,14 +705,27 @@ "labels": { "app": "pegasus" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" }, "images": [ "alpine:3.20", - "registry.bstein.dev/streaming/pegasus:1.2.32" + "registry.bstein.dev/streaming/pegasus-vault:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus-vault-sync", + "labels": { + "app": "pegasus-vault-sync" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -570,6 +744,35 @@ "jenkins/jenkins:2.528.3-jdk21" ] }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins-vault-sync", + "labels": { + "app": "jenkins-vault-sync" + }, + "serviceAccountName": "jenkins-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "ntp-sync", + "labels": { + "app": "ntp-sync" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "public.ecr.aws/docker/library/busybox:1.36.1" + ] + }, { "kind": "DaemonSet", "namespace": "kube-system", @@ -636,6 +839,21 @@ "hashicorp/vault-csi-provider:1.7.0" ] }, + { + "kind": "Deployment", + "namespace": "kube-system", + "name": "coredns", + "labels": { + "k8s-app": "kube-dns" + }, + "serviceAccountName": "coredns", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "registry.bstein.dev/infra/coredns:1.12.1" + ] + }, { "kind": "DaemonSet", "namespace": "logging", @@ -681,6 +899,19 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "logging-vault-sync", + "labels": { + "app": "logging-vault-sync" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "logging", @@ -688,12 +919,27 @@ "labels": { "app": "oauth2-proxy-logs" }, - "serviceAccountName": null, + "serviceAccountName": "logging-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "longhorn-vault-sync", + "labels": { + "app": "longhorn-vault-sync" + }, + "serviceAccountName": "longhorn-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" ] }, { @@ -703,7 +949,7 @@ "labels": { "app": "oauth2-proxy-longhorn" }, - "serviceAccountName": null, + "serviceAccountName": "longhorn-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -729,14 +975,45 @@ { "kind": "Deployment", "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "name": "mailu-vault-sync", "labels": { - "app": "mailu-sync-listener" + "app": "mailu-vault-sync" }, - "serviceAccountName": null, + "serviceAccountName": "mailu-vault-sync", "nodeSelector": {}, "images": [ - "python:3.11-alpine" + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "disable-k3s-traefik", + "labels": { + "app": "disable-k3s-traefik" + }, + "serviceAccountName": "disable-k3s-traefik", + "nodeSelector": { + "node-role.kubernetes.io/control-plane": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "k3s-agent-restart", + "labels": { + "app": "k3s-agent-restart" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, { @@ -767,6 +1044,35 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "ariadne", + "labels": { + "app": "ariadne" + }, + "serviceAccountName": "ariadne", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/ariadne:0.1.0-48" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "maintenance-vault-sync", + "labels": { + "app": "maintenance-vault-sync" + }, + "serviceAccountName": "maintenance-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "DaemonSet", "namespace": "monitoring", @@ -795,6 +1101,19 @@ "python:3.10-slim" ] }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "monitoring-vault-sync", + "labels": { + "app": "monitoring-vault-sync" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "monitoring", @@ -802,7 +1121,7 @@ "labels": { "app": "postmark-exporter" }, - "serviceAccountName": null, + "serviceAccountName": "monitoring-vault-sync", "nodeSelector": {}, "images": [ "python:3.12-alpine" @@ -830,7 +1149,7 @@ "labels": { "app": "nextcloud" }, - "serviceAccountName": null, + "serviceAccountName": "nextcloud-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -845,7 +1164,7 @@ "labels": { "app": "outline" }, - "serviceAccountName": null, + "serviceAccountName": "outline-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -875,7 +1194,7 @@ "labels": { "app": "planka" }, - "serviceAccountName": null, + "serviceAccountName": "planka-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -895,7 +1214,8 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "postgres:15" + "postgres:15", + "quay.io/prometheuscommunity/postgres-exporter:v0.15.0" ] }, { @@ -905,8 +1225,11 @@ "labels": { "app": "keycloak" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "quay.io/keycloak/keycloak:26.0.7" ] @@ -918,12 +1241,25 @@ "labels": { "app": "oauth2-proxy" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "sso-vault-sync", + "labels": { + "app": "sso-vault-sync" + }, + "serviceAccountName": "sso-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -933,7 +1269,7 @@ "labels": { "app": "openldap" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -951,7 +1287,7 @@ }, "serviceAccountName": "sui-metrics", "nodeSelector": { - "kubernetes.io/hostname": "titan-24" + "hardware": "rpi5" }, "images": [ "victoriametrics/vmagent:v1.103.0" @@ -962,7 +1298,9 @@ "namespace": "traefik", "name": "traefik", "labels": { - "app": "traefik" + "app": "traefik", + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" }, "serviceAccountName": "traefik-ingress-controller", "nodeSelector": { @@ -995,8 +1333,11 @@ "labels": { "app": "vaultwarden" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "vaultwarden-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "vaultwarden/server:1.35.2" ] @@ -1565,6 +1906,54 @@ } ] }, + { + "namespace": "crypto", + "name": "wallet-monero-temp", + "type": "ClusterIP", + "selector": { + "app": "wallet-monero-temp" + }, + "ports": [ + { + "name": "rpc", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "actual-budget", + "type": "ClusterIP", + "selector": { + "app": "actual-budget" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 5006, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "firefly", + "type": "ClusterIP", + "selector": { + "app": "firefly" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, { "namespace": "flux-system", "name": "notification-controller", @@ -1632,7 +2021,7 @@ { "namespace": "gitea", "name": "gitea-ssh", - "type": "NodePort", + "type": "LoadBalancer", "selector": { "app": "gitea" }, @@ -1645,6 +2034,22 @@ } ] }, + { + "namespace": "health", + "name": "wger", + "type": "ClusterIP", + "selector": { + "app": "wger" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, { "namespace": "jellyfin", "name": "jellyfin", @@ -1699,29 +2104,6 @@ } ] }, - { - "namespace": "kube-system", - "name": "traefik", - "type": "LoadBalancer", - "selector": { - "app.kubernetes.io/instance": "traefik-kube-system", - "app.kubernetes.io/name": "traefik" - }, - "ports": [ - { - "name": "web", - "port": 80, - "targetPort": "web", - "protocol": "TCP" - }, - { - "name": "websecure", - "port": 443, - "targetPort": "websecure", - "protocol": "TCP" - } - ] - }, { "namespace": "logging", "name": "oauth2-proxy-logs", @@ -1803,17 +2185,17 @@ ] }, { - "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "namespace": "maintenance", + "name": "ariadne", "type": "ClusterIP", "selector": { - "app": "mailu-sync-listener" + "app": "ariadne" }, "ports": [ { "name": "http", - "port": 8080, - "targetPort": 8080, + "port": 80, + "targetPort": "http", "protocol": "TCP" } ] @@ -1959,6 +2341,12 @@ "port": 5432, "targetPort": 5432, "protocol": "TCP" + }, + { + "name": "metrics", + "port": 9187, + "targetPort": 9187, + "protocol": "TCP" } ] }, @@ -2032,6 +2420,28 @@ } ] }, + { + "namespace": "traefik", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, { "namespace": "traefik", "name": "traefik-metrics", @@ -2210,6 +2620,26 @@ "source": "bstein-dev-home" } }, + { + "host": "budget.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "actual-budget", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "actual-budget" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "actual-budget", + "source": "finance" + } + }, { "host": "call.live.bstein.dev", "path": "/", @@ -2290,6 +2720,26 @@ "source": "nextcloud" } }, + { + "host": "health.bstein.dev", + "path": "/", + "backend": { + "namespace": "health", + "service": "wger", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "wger" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "wger", + "source": "health" + } + }, { "host": "kit.live.bstein.dev", "path": "/livekit/jwt", @@ -2385,6 +2835,106 @@ "source": "comms" } }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, { "host": "logs.bstein.dev", "path": "/", @@ -2650,6 +3200,26 @@ "source": "monerod" } }, + { + "host": "money.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "firefly", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "firefly" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "firefly", + "source": "finance" + } + }, { "host": "notes.bstein.dev", "path": "/", @@ -2838,7 +3408,6 @@ "matrix.live.bstein.dev" ], "comms:comms/othrys-synapse": [ - "bstein.dev", "kit.live.bstein.dev", "live.bstein.dev", "matrix.live.bstein.dev", @@ -2853,6 +3422,9 @@ "logging:logging/data-prepper": [ "registry.bstein.dev" ], + "longhorn:longhorn-system/longhorn": [ + "registry.bstein.dev" + ], "mailu:mailu-mailserver/mailu": [ "bstein.dev", "mail.bstein.dev" @@ -2862,8 +3434,12 @@ ], "monitoring:monitoring/grafana": [ "bstein.dev", + "mail.bstein.dev", "metrics.bstein.dev", "sso.bstein.dev" + ], + "monitoring:monitoring/kube-state-metrics": [ + "atlas.bstein.dev" ] } } diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml index 580a331b..b3b0119f 100644 --- a/knowledge/catalog/atlas.yaml +++ b/knowledge/catalog/atlas.yaml @@ -8,6 +8,15 @@ sources: - name: bstein-dev-home path: services/bstein-dev-home targetNamespace: bstein-dev-home +- name: bstein-dev-home-migrations + path: services/bstein-dev-home/migrations + targetNamespace: bstein-dev-home +- name: cert-manager + path: infrastructure/cert-manager + targetNamespace: cert-manager +- name: cert-manager-cleanup + path: infrastructure/cert-manager/cleanup + targetNamespace: cert-manager - name: comms path: services/comms targetNamespace: comms @@ -17,6 +26,9 @@ sources: - name: crypto path: services/crypto targetNamespace: crypto +- name: finance + path: services/finance + targetNamespace: finance - name: flux-system path: clusters/atlas/flux-system targetNamespace: null @@ -29,6 +41,9 @@ sources: - name: harbor path: services/harbor targetNamespace: harbor +- name: health + path: services/health + targetNamespace: health - name: helm path: infrastructure/sources/helm targetNamespace: flux-system @@ -44,6 +59,12 @@ sources: - name: logging path: services/logging targetNamespace: null +- name: longhorn + path: infrastructure/longhorn/core + targetNamespace: longhorn-system +- name: longhorn-adopt + path: infrastructure/longhorn/adopt + targetNamespace: longhorn-system - name: longhorn-ui path: infrastructure/longhorn/ui-ingress targetNamespace: longhorn-system @@ -98,9 +119,15 @@ sources: - name: vault-csi path: infrastructure/vault-csi targetNamespace: kube-system +- name: vault-injector + path: infrastructure/vault-injector + targetNamespace: vault - name: vaultwarden path: services/vaultwarden targetNamespace: vaultwarden +- name: wallet-monero-temp + path: services/crypto/wallet-monero-temp + targetNamespace: crypto - name: xmr-miner path: services/crypto/xmr-miner targetNamespace: crypto @@ -124,7 +151,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 - kind: Deployment namespace: bstein-dev-home name: bstein-dev-home-frontend @@ -135,13 +162,22 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-vault-sync + labels: + app: bstein-dev-home-vault-sync + serviceAccountName: bstein-dev-home-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: bstein-dev-home name: chat-ai-gateway labels: app: chat-ai-gateway - serviceAccountName: null + serviceAccountName: bstein-dev-home nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -157,12 +193,21 @@ workloads: hardware: rpi5 images: - python:3.11-slim +- kind: Deployment + namespace: comms + name: comms-vault-sync + labels: + app: comms-vault-sync + serviceAccountName: comms-vault + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: comms name: coturn labels: app: coturn - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -182,7 +227,7 @@ workloads: name: livekit labels: app: livekit - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -192,17 +237,17 @@ workloads: name: livekit-token-service labels: app: livekit-token-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: - - ghcr.io/element-hq/lk-jwt-service:0.3.0 + - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0 - kind: Deployment namespace: comms name: matrix-authentication-service labels: app: matrix-authentication-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -212,7 +257,7 @@ workloads: name: matrix-guest-register labels: app.kubernetes.io/name: matrix-guest-register - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: {} images: - python:3.11-slim @@ -235,12 +280,21 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 +- kind: Deployment + namespace: crypto + name: crypto-vault-sync + labels: + app: crypto-vault-sync + serviceAccountName: crypto-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: crypto name: monero-p2pool labels: app: monero-p2pool - serviceAccountName: null + serviceAccountName: crypto-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -255,6 +309,38 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: crypto + name: wallet-monero-temp + labels: + app: wallet-monero-temp + serviceAccountName: crypto-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1 +- kind: Deployment + namespace: finance + name: actual-budget + labels: + app: actual-budget + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d +- kind: Deployment + namespace: finance + name: firefly + labels: + app: firefly + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - fireflyiii/core:version-6.4.15 - kind: Deployment namespace: flux-system name: helm-controller @@ -344,17 +430,38 @@ workloads: name: gitea labels: app: gitea - serviceAccountName: null + serviceAccountName: gitea-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - gitea/gitea:1.23 +- kind: Deployment + namespace: harbor + name: harbor-vault-sync + labels: + app: harbor-vault-sync + serviceAccountName: harbor-vault-sync + nodeSelector: {} + images: + - alpine:3.20 +- kind: Deployment + namespace: health + name: wger + labels: + app: wger + serviceAccountName: health-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10 + - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5 - kind: Deployment namespace: jellyfin name: jellyfin labels: app: jellyfin - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: {} images: - docker.io/jellyfin/jellyfin:10.11.5 @@ -363,13 +470,22 @@ workloads: name: pegasus labels: app: pegasus - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - alpine:3.20 - - registry.bstein.dev/streaming/pegasus:1.2.32 + - registry.bstein.dev/streaming/pegasus-vault:1.2.32 +- kind: Deployment + namespace: jellyfin + name: pegasus-vault-sync + labels: + app: pegasus-vault-sync + serviceAccountName: pegasus-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: jenkins name: jenkins @@ -381,6 +497,26 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - jenkins/jenkins:2.528.3-jdk21 +- kind: Deployment + namespace: jenkins + name: jenkins-vault-sync + labels: + app: jenkins-vault-sync + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 +- kind: DaemonSet + namespace: kube-system + name: ntp-sync + labels: + app: ntp-sync + serviceAccountName: null + nodeSelector: {} + images: + - public.ecr.aws/docker/library/busybox:1.36.1 - kind: DaemonSet namespace: kube-system name: nvidia-device-plugin-jetson @@ -427,6 +563,16 @@ workloads: kubernetes.io/os: linux images: - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: kube-system + name: coredns + labels: + k8s-app: kube-dns + serviceAccountName: coredns + nodeSelector: + kubernetes.io/os: linux + images: + - registry.bstein.dev/infra/coredns:1.12.1 - kind: DaemonSet namespace: logging name: node-image-gc-rpi4 @@ -457,22 +603,41 @@ workloads: hardware: rpi5 images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: logging + name: logging-vault-sync + labels: + app: logging-vault-sync + serviceAccountName: logging-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: logging name: oauth2-proxy-logs labels: app: oauth2-proxy-logs - serviceAccountName: null + serviceAccountName: logging-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: longhorn-system + name: longhorn-vault-sync + labels: + app: longhorn-vault-sync + serviceAccountName: longhorn-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 - kind: Deployment namespace: longhorn-system name: oauth2-proxy-longhorn labels: app: oauth2-proxy-longhorn - serviceAccountName: null + serviceAccountName: longhorn-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -489,13 +654,34 @@ workloads: - registry.bstein.dev/bstein/kubectl:1.35.0 - kind: Deployment namespace: mailu-mailserver - name: mailu-sync-listener + name: mailu-vault-sync labels: - app: mailu-sync-listener - serviceAccountName: null + app: mailu-vault-sync + serviceAccountName: mailu-vault-sync nodeSelector: {} images: - - python:3.11-alpine + - alpine:3.20 +- kind: DaemonSet + namespace: maintenance + name: disable-k3s-traefik + labels: + app: disable-k3s-traefik + serviceAccountName: disable-k3s-traefik + nodeSelector: + node-role.kubernetes.io/control-plane: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: DaemonSet + namespace: maintenance + name: k3s-agent-restart + labels: + app: k3s-agent-restart + serviceAccountName: node-nofile + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 - kind: DaemonSet namespace: maintenance name: node-image-sweeper @@ -515,6 +701,26 @@ workloads: nodeSelector: {} images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: maintenance + name: ariadne + labels: + app: ariadne + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/ariadne:0.1.0-48 +- kind: Deployment + namespace: maintenance + name: maintenance-vault-sync + labels: + app: maintenance-vault-sync + serviceAccountName: maintenance-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: DaemonSet namespace: monitoring name: dcgm-exporter @@ -534,12 +740,21 @@ workloads: jetson: 'true' images: - python:3.10-slim +- kind: Deployment + namespace: monitoring + name: monitoring-vault-sync + labels: + app: monitoring-vault-sync + serviceAccountName: monitoring-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: monitoring name: postmark-exporter labels: app: postmark-exporter - serviceAccountName: null + serviceAccountName: monitoring-vault-sync nodeSelector: {} images: - python:3.12-alpine @@ -558,7 +773,7 @@ workloads: name: nextcloud labels: app: nextcloud - serviceAccountName: null + serviceAccountName: nextcloud-vault nodeSelector: hardware: rpi5 images: @@ -568,7 +783,7 @@ workloads: name: outline labels: app: outline - serviceAccountName: null + serviceAccountName: outline-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -588,7 +803,7 @@ workloads: name: planka labels: app: planka - serviceAccountName: null + serviceAccountName: planka-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -603,13 +818,16 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - postgres:15 + - quay.io/prometheuscommunity/postgres-exporter:v0.15.0 - kind: Deployment namespace: sso name: keycloak labels: app: keycloak - serviceAccountName: null - nodeSelector: {} + serviceAccountName: sso-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - quay.io/keycloak/keycloak:26.0.7 - kind: Deployment @@ -617,17 +835,26 @@ workloads: name: oauth2-proxy labels: app: oauth2-proxy - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: sso + name: sso-vault-sync + labels: + app: sso-vault-sync + serviceAccountName: sso-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: StatefulSet namespace: sso name: openldap labels: app: openldap - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -640,7 +867,7 @@ workloads: app: sui-metrics serviceAccountName: sui-metrics nodeSelector: - kubernetes.io/hostname: titan-24 + hardware: rpi5 images: - victoriametrics/vmagent:v1.103.0 - kind: Deployment @@ -648,6 +875,8 @@ workloads: name: traefik labels: app: traefik + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik serviceAccountName: traefik-ingress-controller nodeSelector: node-role.kubernetes.io/worker: 'true' @@ -669,8 +898,10 @@ workloads: name: vaultwarden labels: app: vaultwarden - serviceAccountName: null - nodeSelector: {} + serviceAccountName: vaultwarden-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - vaultwarden/server:1.35.2 services: @@ -1040,6 +1271,36 @@ services: port: 3333 targetPort: 3333 protocol: TCP +- namespace: crypto + name: wallet-monero-temp + type: ClusterIP + selector: + app: wallet-monero-temp + ports: + - name: rpc + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: finance + name: actual-budget + type: ClusterIP + selector: + app: actual-budget + ports: + - name: http + port: 80 + targetPort: 5006 + protocol: TCP +- namespace: finance + name: firefly + type: ClusterIP + selector: + app: firefly + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP - namespace: flux-system name: notification-controller type: ClusterIP @@ -1082,7 +1343,7 @@ services: protocol: TCP - namespace: gitea name: gitea-ssh - type: NodePort + type: LoadBalancer selector: app: gitea ports: @@ -1090,6 +1351,16 @@ services: port: 2242 targetPort: 2242 protocol: TCP +- namespace: health + name: wger + type: ClusterIP + selector: + app: wger + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP - namespace: jellyfin name: jellyfin type: ClusterIP @@ -1124,21 +1395,6 @@ services: port: 50000 targetPort: 50000 protocol: TCP -- namespace: kube-system - name: traefik - type: LoadBalancer - selector: - app.kubernetes.io/instance: traefik-kube-system - app.kubernetes.io/name: traefik - ports: - - name: web - port: 80 - targetPort: web - protocol: TCP - - name: websecure - port: 443 - targetPort: websecure - protocol: TCP - namespace: logging name: oauth2-proxy-logs type: ClusterIP @@ -1191,15 +1447,15 @@ services: port: 4190 targetPort: 4190 protocol: TCP -- namespace: mailu-mailserver - name: mailu-sync-listener +- namespace: maintenance + name: ariadne type: ClusterIP selector: - app: mailu-sync-listener + app: ariadne ports: - name: http - port: 8080 - targetPort: 8080 + port: 80 + targetPort: http protocol: TCP - namespace: monitoring name: dcgm-exporter @@ -1291,6 +1547,10 @@ services: port: 5432 targetPort: 5432 protocol: TCP + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP - namespace: sso name: keycloak type: ClusterIP @@ -1335,6 +1595,20 @@ services: port: 8429 targetPort: 8429 protocol: TCP +- namespace: traefik + name: traefik + type: LoadBalancer + selector: + app: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP - namespace: traefik name: traefik-metrics type: ClusterIP @@ -1447,6 +1721,19 @@ http_endpoints: kind: Ingress name: bstein-dev-home source: bstein-dev-home +- host: budget.bstein.dev + path: / + backend: + namespace: finance + service: actual-budget + port: 80 + workloads: + - kind: Deployment + name: actual-budget + via: + kind: Ingress + name: actual-budget + source: finance - host: call.live.bstein.dev path: / backend: @@ -1499,6 +1786,19 @@ http_endpoints: kind: Ingress name: nextcloud source: nextcloud +- host: health.bstein.dev + path: / + backend: + namespace: health + service: wger + port: 80 + workloads: + - kind: Deployment + name: wger + via: + kind: Ingress + name: wger + source: health - host: kit.live.bstein.dev path: /livekit/jwt backend: @@ -1558,6 +1858,65 @@ http_endpoints: kind: Ingress name: matrix-routing source: comms +- host: live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id002 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: comms - host: logs.bstein.dev path: / backend: @@ -1601,9 +1960,7 @@ http_endpoints: namespace: comms service: matrix-authentication-service port: 8080 - workloads: &id002 - - kind: Deployment - name: matrix-authentication-service + workloads: *id002 via: kind: Ingress name: matrix-routing @@ -1647,9 +2004,7 @@ http_endpoints: namespace: comms service: matrix-guest-register port: 8080 - workloads: &id003 - - kind: Deployment - name: matrix-guest-register + workloads: *id003 via: kind: Ingress name: matrix-routing @@ -1722,6 +2077,19 @@ http_endpoints: kind: Ingress name: monerod source: monerod +- host: money.bstein.dev + path: / + backend: + namespace: finance + service: firefly + port: 80 + workloads: + - kind: Deployment + name: firefly + via: + kind: Ingress + name: firefly + source: finance - host: notes.bstein.dev path: / backend: @@ -1845,7 +2213,6 @@ helmrelease_host_hints: - live.bstein.dev - matrix.live.bstein.dev comms:comms/othrys-synapse: - - bstein.dev - kit.live.bstein.dev - live.bstein.dev - matrix.live.bstein.dev @@ -1856,6 +2223,8 @@ helmrelease_host_hints: - registry.bstein.dev logging:logging/data-prepper: - registry.bstein.dev + longhorn:longhorn-system/longhorn: + - registry.bstein.dev mailu:mailu-mailserver/mailu: - bstein.dev - mail.bstein.dev @@ -1863,5 +2232,8 @@ helmrelease_host_hints: - alerts.bstein.dev monitoring:monitoring/grafana: - bstein.dev + - mail.bstein.dev - metrics.bstein.dev - sso.bstein.dev + monitoring:monitoring/kube-state-metrics: + - atlas.bstein.dev diff --git a/knowledge/diagrams/atlas-http.mmd b/knowledge/diagrams/atlas-http.mmd index ab7c3621..1aa7ac80 100644 --- a/knowledge/diagrams/atlas-http.mmd +++ b/knowledge/diagrams/atlas-http.mmd @@ -17,6 +17,11 @@ flowchart LR host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_budget_bstein_dev["budget.bstein.dev"] + svc_finance_actual_budget["finance/actual-budget (Service)"] + host_budget_bstein_dev --> svc_finance_actual_budget + wl_finance_actual_budget["finance/actual-budget (Deployment)"] + svc_finance_actual_budget --> wl_finance_actual_budget host_call_live_bstein_dev["call.live.bstein.dev"] svc_comms_element_call["comms/element-call (Service)"] host_call_live_bstein_dev --> svc_comms_element_call @@ -37,6 +42,11 @@ flowchart LR host_cloud_bstein_dev --> svc_nextcloud_nextcloud wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_health_bstein_dev["health.bstein.dev"] + svc_health_wger["health/wger (Service)"] + host_health_bstein_dev --> svc_health_wger + wl_health_wger["health/wger (Deployment)"] + svc_health_wger --> wl_health_wger host_kit_live_bstein_dev["kit.live.bstein.dev"] svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] host_kit_live_bstein_dev --> svc_comms_livekit_token_service @@ -50,6 +60,14 @@ flowchart LR host_live_bstein_dev --> svc_comms_matrix_wellknown svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_logs_bstein_dev["logs.bstein.dev"] svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"] host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs @@ -64,21 +82,20 @@ flowchart LR svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front host_matrix_live_bstein_dev["matrix.live.bstein.dev"] - svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] - svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register - wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] - svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register host_monero_bstein_dev["monero.bstein.dev"] svc_crypto_monerod["crypto/monerod (Service)"] host_monero_bstein_dev --> svc_crypto_monerod wl_crypto_monerod["crypto/monerod (Deployment)"] svc_crypto_monerod --> wl_crypto_monerod + host_money_bstein_dev["money.bstein.dev"] + svc_finance_firefly["finance/firefly (Service)"] + host_money_bstein_dev --> svc_finance_firefly + wl_finance_firefly["finance/firefly (Deployment)"] + svc_finance_firefly --> wl_finance_firefly host_notes_bstein_dev["notes.bstein.dev"] svc_outline_outline["outline/outline (Service)"] host_notes_bstein_dev --> svc_outline_outline @@ -143,19 +160,29 @@ flowchart LR svc_comms_livekit wl_comms_livekit svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service svc_comms_matrix_guest_register wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service end subgraph crypto[crypto] svc_crypto_monerod wl_crypto_monerod end + subgraph finance[finance] + svc_finance_actual_budget + wl_finance_actual_budget + svc_finance_firefly + wl_finance_firefly + end subgraph gitea[gitea] svc_gitea_gitea wl_gitea_gitea end + subgraph health[health] + svc_health_wger + wl_health_wger + end subgraph jellyfin[jellyfin] svc_jellyfin_pegasus wl_jellyfin_pegasus diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index c7f9f26f..34938e74 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -20,6 +20,7 @@ import subprocess import sys from dataclasses import dataclass from pathlib import Path +import shutil from typing import Any, Iterable import yaml @@ -60,6 +61,12 @@ def _run(cmd: list[str], *, cwd: Path) -> str: return res.stdout +def _sync_tree(source: Path, dest: Path) -> None: + if dest.exists(): + shutil.rmtree(dest) + shutil.copytree(source, dest) + + def kustomize_build(path: Path) -> str: rel = path.relative_to(REPO_ROOT) try: @@ -472,6 +479,11 @@ def main() -> int: action="store_true", help="Write generated files (otherwise just print a summary).", ) + ap.add_argument( + "--sync-comms", + action="store_true", + help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.", + ) args = ap.parse_args() out_dir = REPO_ROOT / args.out @@ -549,6 +561,11 @@ def main() -> int: print(f"Wrote {summary_path.relative_to(REPO_ROOT)}") print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}") print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") + + if args.sync_comms: + comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" + _sync_tree(out_dir, comms_dir) + print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}") return 0 diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7aedf4a0..70844ebf 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-8 + checksum/atlasbot-configmap: manual-atlasbot-9 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -73,6 +73,8 @@ spec: value: /kb - name: VM_URL value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 + - name: ARIADNE_STATE_URL + value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state - name: BOT_USER value: atlasbot - name: BOT_MENTIONS diff --git a/services/comms/knowledge/catalog/atlas-summary.json b/services/comms/knowledge/catalog/atlas-summary.json index fa350516..ea825ce7 100644 --- a/services/comms/knowledge/catalog/atlas-summary.json +++ b/services/comms/knowledge/catalog/atlas-summary.json @@ -1,8 +1,8 @@ { "counts": { - "helmrelease_host_hints": 17, - "http_endpoints": 37, - "services": 43, - "workloads": 54 + "helmrelease_host_hints": 19, + "http_endpoints": 45, + "services": 47, + "workloads": 74 } } diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json index 18cb6b64..21ac4073 100644 --- a/services/comms/knowledge/catalog/atlas.json +++ b/services/comms/knowledge/catalog/atlas.json @@ -11,6 +11,21 @@ "path": "services/bstein-dev-home", "targetNamespace": "bstein-dev-home" }, + { + "name": "bstein-dev-home-migrations", + "path": "services/bstein-dev-home/migrations", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "cert-manager", + "path": "infrastructure/cert-manager", + "targetNamespace": "cert-manager" + }, + { + "name": "cert-manager-cleanup", + "path": "infrastructure/cert-manager/cleanup", + "targetNamespace": "cert-manager" + }, { "name": "comms", "path": "services/comms", @@ -26,6 +41,11 @@ "path": "services/crypto", "targetNamespace": "crypto" }, + { + "name": "finance", + "path": "services/finance", + "targetNamespace": "finance" + }, { "name": "flux-system", "path": "clusters/atlas/flux-system", @@ -46,6 +66,11 @@ "path": "services/harbor", "targetNamespace": "harbor" }, + { + "name": "health", + "path": "services/health", + "targetNamespace": "health" + }, { "name": "helm", "path": "infrastructure/sources/helm", @@ -71,6 +96,16 @@ "path": "services/logging", "targetNamespace": null }, + { + "name": "longhorn", + "path": "infrastructure/longhorn/core", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-adopt", + "path": "infrastructure/longhorn/adopt", + "targetNamespace": "longhorn-system" + }, { "name": "longhorn-ui", "path": "infrastructure/longhorn/ui-ingress", @@ -161,11 +196,21 @@ "path": "infrastructure/vault-csi", "targetNamespace": "kube-system" }, + { + "name": "vault-injector", + "path": "infrastructure/vault-injector", + "targetNamespace": "vault" + }, { "name": "vaultwarden", "path": "services/vaultwarden", "targetNamespace": "vaultwarden" }, + { + "name": "wallet-monero-temp", + "path": "services/crypto/wallet-monero-temp", + "targetNamespace": "crypto" + }, { "name": "xmr-miner", "path": "services/crypto/xmr-miner", @@ -199,7 +244,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157" ] }, { @@ -215,7 +260,20 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-vault-sync", + "labels": { + "app": "bstein-dev-home-vault-sync" + }, + "serviceAccountName": "bstein-dev-home-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -225,7 +283,7 @@ "labels": { "app": "chat-ai-gateway" }, - "serviceAccountName": null, + "serviceAccountName": "bstein-dev-home", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -249,6 +307,19 @@ "python:3.11-slim" ] }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "comms-vault-sync", + "labels": { + "app": "comms-vault-sync" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "comms", @@ -256,7 +327,7 @@ "labels": { "app": "coturn" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -286,7 +357,7 @@ "labels": { "app": "livekit" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -301,12 +372,12 @@ "labels": { "app": "livekit-token-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, "images": [ - "ghcr.io/element-hq/lk-jwt-service:0.3.0" + "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0" ] }, { @@ -316,7 +387,7 @@ "labels": { "app": "matrix-authentication-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -331,7 +402,7 @@ "labels": { "app.kubernetes.io/name": "matrix-guest-register" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": {}, "images": [ "python:3.11-slim" @@ -365,6 +436,19 @@ "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "crypto-vault-sync", + "labels": { + "app": "crypto-vault-sync" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "crypto", @@ -372,7 +456,7 @@ "labels": { "app": "monero-p2pool" }, - "serviceAccountName": null, + "serviceAccountName": "crypto-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -395,6 +479,53 @@ "registry.bstein.dev/crypto/monerod:0.18.4.1" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "wallet-monero-temp", + "labels": { + "app": "wallet-monero-temp" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "actual-budget", + "labels": { + "app": "actual-budget" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "firefly", + "labels": { + "app": "firefly" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "fireflyiii/core:version-6.4.15" + ] + }, { "kind": "Deployment", "namespace": "flux-system", @@ -516,7 +647,7 @@ "labels": { "app": "gitea" }, - "serviceAccountName": null, + "serviceAccountName": "gitea-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -524,6 +655,36 @@ "gitea/gitea:1.23" ] }, + { + "kind": "Deployment", + "namespace": "harbor", + "name": "harbor-vault-sync", + "labels": { + "app": "harbor-vault-sync" + }, + "serviceAccountName": "harbor-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "health", + "name": "wger", + "labels": { + "app": "wger" + }, + "serviceAccountName": "health-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10", + "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5" + ] + }, { "kind": "Deployment", "namespace": "jellyfin", @@ -531,7 +692,7 @@ "labels": { "app": "jellyfin" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": {}, "images": [ "docker.io/jellyfin/jellyfin:10.11.5" @@ -544,14 +705,27 @@ "labels": { "app": "pegasus" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" }, "images": [ "alpine:3.20", - "registry.bstein.dev/streaming/pegasus:1.2.32" + "registry.bstein.dev/streaming/pegasus-vault:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus-vault-sync", + "labels": { + "app": "pegasus-vault-sync" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -570,6 +744,35 @@ "jenkins/jenkins:2.528.3-jdk21" ] }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins-vault-sync", + "labels": { + "app": "jenkins-vault-sync" + }, + "serviceAccountName": "jenkins-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "ntp-sync", + "labels": { + "app": "ntp-sync" + }, + "serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "public.ecr.aws/docker/library/busybox:1.36.1" + ] + }, { "kind": "DaemonSet", "namespace": "kube-system", @@ -636,6 +839,21 @@ "hashicorp/vault-csi-provider:1.7.0" ] }, + { + "kind": "Deployment", + "namespace": "kube-system", + "name": "coredns", + "labels": { + "k8s-app": "kube-dns" + }, + "serviceAccountName": "coredns", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "registry.bstein.dev/infra/coredns:1.12.1" + ] + }, { "kind": "DaemonSet", "namespace": "logging", @@ -681,6 +899,19 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "logging-vault-sync", + "labels": { + "app": "logging-vault-sync" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "logging", @@ -688,12 +919,27 @@ "labels": { "app": "oauth2-proxy-logs" }, - "serviceAccountName": null, + "serviceAccountName": "logging-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "longhorn-vault-sync", + "labels": { + "app": "longhorn-vault-sync" + }, + "serviceAccountName": "longhorn-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" ] }, { @@ -703,7 +949,7 @@ "labels": { "app": "oauth2-proxy-longhorn" }, - "serviceAccountName": null, + "serviceAccountName": "longhorn-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -729,14 +975,45 @@ { "kind": "Deployment", "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "name": "mailu-vault-sync", "labels": { - "app": "mailu-sync-listener" + "app": "mailu-vault-sync" }, - "serviceAccountName": null, + "serviceAccountName": "mailu-vault-sync", "nodeSelector": {}, "images": [ - "python:3.11-alpine" + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "disable-k3s-traefik", + "labels": { + "app": "disable-k3s-traefik" + }, + "serviceAccountName": "disable-k3s-traefik", + "nodeSelector": { + "node-role.kubernetes.io/control-plane": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "k3s-agent-restart", + "labels": { + "app": "k3s-agent-restart" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, { @@ -767,6 +1044,35 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "ariadne", + "labels": { + "app": "ariadne" + }, + "serviceAccountName": "ariadne", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/ariadne:0.1.0-48" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "maintenance-vault-sync", + "labels": { + "app": "maintenance-vault-sync" + }, + "serviceAccountName": "maintenance-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "DaemonSet", "namespace": "monitoring", @@ -795,6 +1101,19 @@ "python:3.10-slim" ] }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "monitoring-vault-sync", + "labels": { + "app": "monitoring-vault-sync" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "monitoring", @@ -802,7 +1121,7 @@ "labels": { "app": "postmark-exporter" }, - "serviceAccountName": null, + "serviceAccountName": "monitoring-vault-sync", "nodeSelector": {}, "images": [ "python:3.12-alpine" @@ -830,7 +1149,7 @@ "labels": { "app": "nextcloud" }, - "serviceAccountName": null, + "serviceAccountName": "nextcloud-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -845,7 +1164,7 @@ "labels": { "app": "outline" }, - "serviceAccountName": null, + "serviceAccountName": "outline-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -875,7 +1194,7 @@ "labels": { "app": "planka" }, - "serviceAccountName": null, + "serviceAccountName": "planka-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -895,7 +1214,8 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "postgres:15" + "postgres:15", + "quay.io/prometheuscommunity/postgres-exporter:v0.15.0" ] }, { @@ -905,8 +1225,11 @@ "labels": { "app": "keycloak" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "quay.io/keycloak/keycloak:26.0.7" ] @@ -918,12 +1241,25 @@ "labels": { "app": "oauth2-proxy" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "sso-vault-sync", + "labels": { + "app": "sso-vault-sync" + }, + "serviceAccountName": "sso-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -933,7 +1269,7 @@ "labels": { "app": "openldap" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -951,7 +1287,7 @@ }, "serviceAccountName": "sui-metrics", "nodeSelector": { - "kubernetes.io/hostname": "titan-24" + "hardware": "rpi5" }, "images": [ "victoriametrics/vmagent:v1.103.0" @@ -962,7 +1298,9 @@ "namespace": "traefik", "name": "traefik", "labels": { - "app": "traefik" + "app": "traefik", + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" }, "serviceAccountName": "traefik-ingress-controller", "nodeSelector": { @@ -995,8 +1333,11 @@ "labels": { "app": "vaultwarden" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "vaultwarden-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "vaultwarden/server:1.35.2" ] @@ -1565,6 +1906,54 @@ } ] }, + { + "namespace": "crypto", + "name": "wallet-monero-temp", + "type": "ClusterIP", + "selector": { + "app": "wallet-monero-temp" + }, + "ports": [ + { + "name": "rpc", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "actual-budget", + "type": "ClusterIP", + "selector": { + "app": "actual-budget" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 5006, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "firefly", + "type": "ClusterIP", + "selector": { + "app": "firefly" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, { "namespace": "flux-system", "name": "notification-controller", @@ -1632,7 +2021,7 @@ { "namespace": "gitea", "name": "gitea-ssh", - "type": "NodePort", + "type": "LoadBalancer", "selector": { "app": "gitea" }, @@ -1645,6 +2034,22 @@ } ] }, + { + "namespace": "health", + "name": "wger", + "type": "ClusterIP", + "selector": { + "app": "wger" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, { "namespace": "jellyfin", "name": "jellyfin", @@ -1699,29 +2104,6 @@ } ] }, - { - "namespace": "kube-system", - "name": "traefik", - "type": "LoadBalancer", - "selector": { - "app.kubernetes.io/instance": "traefik-kube-system", - "app.kubernetes.io/name": "traefik" - }, - "ports": [ - { - "name": "web", - "port": 80, - "targetPort": "web", - "protocol": "TCP" - }, - { - "name": "websecure", - "port": 443, - "targetPort": "websecure", - "protocol": "TCP" - } - ] - }, { "namespace": "logging", "name": "oauth2-proxy-logs", @@ -1803,17 +2185,17 @@ ] }, { - "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "namespace": "maintenance", + "name": "ariadne", "type": "ClusterIP", "selector": { - "app": "mailu-sync-listener" + "app": "ariadne" }, "ports": [ { "name": "http", - "port": 8080, - "targetPort": 8080, + "port": 80, + "targetPort": "http", "protocol": "TCP" } ] @@ -1959,6 +2341,12 @@ "port": 5432, "targetPort": 5432, "protocol": "TCP" + }, + { + "name": "metrics", + "port": 9187, + "targetPort": 9187, + "protocol": "TCP" } ] }, @@ -2032,6 +2420,28 @@ } ] }, + { + "namespace": "traefik", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, { "namespace": "traefik", "name": "traefik-metrics", @@ -2210,6 +2620,26 @@ "source": "bstein-dev-home" } }, + { + "host": "budget.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "actual-budget", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "actual-budget" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "actual-budget", + "source": "finance" + } + }, { "host": "call.live.bstein.dev", "path": "/", @@ -2290,6 +2720,26 @@ "source": "nextcloud" } }, + { + "host": "health.bstein.dev", + "path": "/", + "backend": { + "namespace": "health", + "service": "wger", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "wger" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "wger", + "source": "health" + } + }, { "host": "kit.live.bstein.dev", "path": "/livekit/jwt", @@ -2385,6 +2835,106 @@ "source": "comms" } }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, { "host": "logs.bstein.dev", "path": "/", @@ -2650,6 +3200,26 @@ "source": "monerod" } }, + { + "host": "money.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "firefly", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "firefly" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "firefly", + "source": "finance" + } + }, { "host": "notes.bstein.dev", "path": "/", @@ -2838,7 +3408,6 @@ "matrix.live.bstein.dev" ], "comms:comms/othrys-synapse": [ - "bstein.dev", "kit.live.bstein.dev", "live.bstein.dev", "matrix.live.bstein.dev", @@ -2853,6 +3422,9 @@ "logging:logging/data-prepper": [ "registry.bstein.dev" ], + "longhorn:longhorn-system/longhorn": [ + "registry.bstein.dev" + ], "mailu:mailu-mailserver/mailu": [ "bstein.dev", "mail.bstein.dev" @@ -2862,8 +3434,12 @@ ], "monitoring:monitoring/grafana": [ "bstein.dev", + "mail.bstein.dev", "metrics.bstein.dev", "sso.bstein.dev" + ], + "monitoring:monitoring/kube-state-metrics": [ + "atlas.bstein.dev" ] } } diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml index 67f2fcb2..b3b0119f 100644 --- a/services/comms/knowledge/catalog/atlas.yaml +++ b/services/comms/knowledge/catalog/atlas.yaml @@ -1,4 +1,4 @@ -# services/comms/knowledge/catalog/atlas.yaml +# knowledge/catalog/atlas.yaml # Generated by scripts/knowledge_render_atlas.py (do not edit by hand) cluster: atlas sources: @@ -8,6 +8,15 @@ sources: - name: bstein-dev-home path: services/bstein-dev-home targetNamespace: bstein-dev-home +- name: bstein-dev-home-migrations + path: services/bstein-dev-home/migrations + targetNamespace: bstein-dev-home +- name: cert-manager + path: infrastructure/cert-manager + targetNamespace: cert-manager +- name: cert-manager-cleanup + path: infrastructure/cert-manager/cleanup + targetNamespace: cert-manager - name: comms path: services/comms targetNamespace: comms @@ -17,6 +26,9 @@ sources: - name: crypto path: services/crypto targetNamespace: crypto +- name: finance + path: services/finance + targetNamespace: finance - name: flux-system path: clusters/atlas/flux-system targetNamespace: null @@ -29,6 +41,9 @@ sources: - name: harbor path: services/harbor targetNamespace: harbor +- name: health + path: services/health + targetNamespace: health - name: helm path: infrastructure/sources/helm targetNamespace: flux-system @@ -44,6 +59,12 @@ sources: - name: logging path: services/logging targetNamespace: null +- name: longhorn + path: infrastructure/longhorn/core + targetNamespace: longhorn-system +- name: longhorn-adopt + path: infrastructure/longhorn/adopt + targetNamespace: longhorn-system - name: longhorn-ui path: infrastructure/longhorn/ui-ingress targetNamespace: longhorn-system @@ -98,9 +119,15 @@ sources: - name: vault-csi path: infrastructure/vault-csi targetNamespace: kube-system +- name: vault-injector + path: infrastructure/vault-injector + targetNamespace: vault - name: vaultwarden path: services/vaultwarden targetNamespace: vaultwarden +- name: wallet-monero-temp + path: services/crypto/wallet-monero-temp + targetNamespace: crypto - name: xmr-miner path: services/crypto/xmr-miner targetNamespace: crypto @@ -124,7 +151,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 - kind: Deployment namespace: bstein-dev-home name: bstein-dev-home-frontend @@ -135,13 +162,22 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-vault-sync + labels: + app: bstein-dev-home-vault-sync + serviceAccountName: bstein-dev-home-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: bstein-dev-home name: chat-ai-gateway labels: app: chat-ai-gateway - serviceAccountName: null + serviceAccountName: bstein-dev-home nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -157,12 +193,21 @@ workloads: hardware: rpi5 images: - python:3.11-slim +- kind: Deployment + namespace: comms + name: comms-vault-sync + labels: + app: comms-vault-sync + serviceAccountName: comms-vault + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: comms name: coturn labels: app: coturn - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -182,7 +227,7 @@ workloads: name: livekit labels: app: livekit - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -192,17 +237,17 @@ workloads: name: livekit-token-service labels: app: livekit-token-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: - - ghcr.io/element-hq/lk-jwt-service:0.3.0 + - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0 - kind: Deployment namespace: comms name: matrix-authentication-service labels: app: matrix-authentication-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -212,7 +257,7 @@ workloads: name: matrix-guest-register labels: app.kubernetes.io/name: matrix-guest-register - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: {} images: - python:3.11-slim @@ -235,12 +280,21 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 +- kind: Deployment + namespace: crypto + name: crypto-vault-sync + labels: + app: crypto-vault-sync + serviceAccountName: crypto-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: crypto name: monero-p2pool labels: app: monero-p2pool - serviceAccountName: null + serviceAccountName: crypto-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -255,6 +309,38 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: crypto + name: wallet-monero-temp + labels: + app: wallet-monero-temp + serviceAccountName: crypto-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1 +- kind: Deployment + namespace: finance + name: actual-budget + labels: + app: actual-budget + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d +- kind: Deployment + namespace: finance + name: firefly + labels: + app: firefly + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - fireflyiii/core:version-6.4.15 - kind: Deployment namespace: flux-system name: helm-controller @@ -344,17 +430,38 @@ workloads: name: gitea labels: app: gitea - serviceAccountName: null + serviceAccountName: gitea-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - gitea/gitea:1.23 +- kind: Deployment + namespace: harbor + name: harbor-vault-sync + labels: + app: harbor-vault-sync + serviceAccountName: harbor-vault-sync + nodeSelector: {} + images: + - alpine:3.20 +- kind: Deployment + namespace: health + name: wger + labels: + app: wger + serviceAccountName: health-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10 + - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5 - kind: Deployment namespace: jellyfin name: jellyfin labels: app: jellyfin - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: {} images: - docker.io/jellyfin/jellyfin:10.11.5 @@ -363,13 +470,22 @@ workloads: name: pegasus labels: app: pegasus - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - alpine:3.20 - - registry.bstein.dev/streaming/pegasus:1.2.32 + - registry.bstein.dev/streaming/pegasus-vault:1.2.32 +- kind: Deployment + namespace: jellyfin + name: pegasus-vault-sync + labels: + app: pegasus-vault-sync + serviceAccountName: pegasus-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: jenkins name: jenkins @@ -381,6 +497,26 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - jenkins/jenkins:2.528.3-jdk21 +- kind: Deployment + namespace: jenkins + name: jenkins-vault-sync + labels: + app: jenkins-vault-sync + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 +- kind: DaemonSet + namespace: kube-system + name: ntp-sync + labels: + app: ntp-sync + serviceAccountName: null + nodeSelector: {} + images: + - public.ecr.aws/docker/library/busybox:1.36.1 - kind: DaemonSet namespace: kube-system name: nvidia-device-plugin-jetson @@ -427,6 +563,16 @@ workloads: kubernetes.io/os: linux images: - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: kube-system + name: coredns + labels: + k8s-app: kube-dns + serviceAccountName: coredns + nodeSelector: + kubernetes.io/os: linux + images: + - registry.bstein.dev/infra/coredns:1.12.1 - kind: DaemonSet namespace: logging name: node-image-gc-rpi4 @@ -457,22 +603,41 @@ workloads: hardware: rpi5 images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: logging + name: logging-vault-sync + labels: + app: logging-vault-sync + serviceAccountName: logging-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: logging name: oauth2-proxy-logs labels: app: oauth2-proxy-logs - serviceAccountName: null + serviceAccountName: logging-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: longhorn-system + name: longhorn-vault-sync + labels: + app: longhorn-vault-sync + serviceAccountName: longhorn-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 - kind: Deployment namespace: longhorn-system name: oauth2-proxy-longhorn labels: app: oauth2-proxy-longhorn - serviceAccountName: null + serviceAccountName: longhorn-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -489,13 +654,34 @@ workloads: - registry.bstein.dev/bstein/kubectl:1.35.0 - kind: Deployment namespace: mailu-mailserver - name: mailu-sync-listener + name: mailu-vault-sync labels: - app: mailu-sync-listener - serviceAccountName: null + app: mailu-vault-sync + serviceAccountName: mailu-vault-sync nodeSelector: {} images: - - python:3.11-alpine + - alpine:3.20 +- kind: DaemonSet + namespace: maintenance + name: disable-k3s-traefik + labels: + app: disable-k3s-traefik + serviceAccountName: disable-k3s-traefik + nodeSelector: + node-role.kubernetes.io/control-plane: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: DaemonSet + namespace: maintenance + name: k3s-agent-restart + labels: + app: k3s-agent-restart + serviceAccountName: node-nofile + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 - kind: DaemonSet namespace: maintenance name: node-image-sweeper @@ -515,6 +701,26 @@ workloads: nodeSelector: {} images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: maintenance + name: ariadne + labels: + app: ariadne + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/ariadne:0.1.0-48 +- kind: Deployment + namespace: maintenance + name: maintenance-vault-sync + labels: + app: maintenance-vault-sync + serviceAccountName: maintenance-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: DaemonSet namespace: monitoring name: dcgm-exporter @@ -534,12 +740,21 @@ workloads: jetson: 'true' images: - python:3.10-slim +- kind: Deployment + namespace: monitoring + name: monitoring-vault-sync + labels: + app: monitoring-vault-sync + serviceAccountName: monitoring-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: monitoring name: postmark-exporter labels: app: postmark-exporter - serviceAccountName: null + serviceAccountName: monitoring-vault-sync nodeSelector: {} images: - python:3.12-alpine @@ -558,7 +773,7 @@ workloads: name: nextcloud labels: app: nextcloud - serviceAccountName: null + serviceAccountName: nextcloud-vault nodeSelector: hardware: rpi5 images: @@ -568,7 +783,7 @@ workloads: name: outline labels: app: outline - serviceAccountName: null + serviceAccountName: outline-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -588,7 +803,7 @@ workloads: name: planka labels: app: planka - serviceAccountName: null + serviceAccountName: planka-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -603,13 +818,16 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - postgres:15 + - quay.io/prometheuscommunity/postgres-exporter:v0.15.0 - kind: Deployment namespace: sso name: keycloak labels: app: keycloak - serviceAccountName: null - nodeSelector: {} + serviceAccountName: sso-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - quay.io/keycloak/keycloak:26.0.7 - kind: Deployment @@ -617,17 +835,26 @@ workloads: name: oauth2-proxy labels: app: oauth2-proxy - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: sso + name: sso-vault-sync + labels: + app: sso-vault-sync + serviceAccountName: sso-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: StatefulSet namespace: sso name: openldap labels: app: openldap - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -640,7 +867,7 @@ workloads: app: sui-metrics serviceAccountName: sui-metrics nodeSelector: - kubernetes.io/hostname: titan-24 + hardware: rpi5 images: - victoriametrics/vmagent:v1.103.0 - kind: Deployment @@ -648,6 +875,8 @@ workloads: name: traefik labels: app: traefik + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik serviceAccountName: traefik-ingress-controller nodeSelector: node-role.kubernetes.io/worker: 'true' @@ -669,8 +898,10 @@ workloads: name: vaultwarden labels: app: vaultwarden - serviceAccountName: null - nodeSelector: {} + serviceAccountName: vaultwarden-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - vaultwarden/server:1.35.2 services: @@ -1040,6 +1271,36 @@ services: port: 3333 targetPort: 3333 protocol: TCP +- namespace: crypto + name: wallet-monero-temp + type: ClusterIP + selector: + app: wallet-monero-temp + ports: + - name: rpc + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: finance + name: actual-budget + type: ClusterIP + selector: + app: actual-budget + ports: + - name: http + port: 80 + targetPort: 5006 + protocol: TCP +- namespace: finance + name: firefly + type: ClusterIP + selector: + app: firefly + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP - namespace: flux-system name: notification-controller type: ClusterIP @@ -1082,7 +1343,7 @@ services: protocol: TCP - namespace: gitea name: gitea-ssh - type: NodePort + type: LoadBalancer selector: app: gitea ports: @@ -1090,6 +1351,16 @@ services: port: 2242 targetPort: 2242 protocol: TCP +- namespace: health + name: wger + type: ClusterIP + selector: + app: wger + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP - namespace: jellyfin name: jellyfin type: ClusterIP @@ -1124,21 +1395,6 @@ services: port: 50000 targetPort: 50000 protocol: TCP -- namespace: kube-system - name: traefik - type: LoadBalancer - selector: - app.kubernetes.io/instance: traefik-kube-system - app.kubernetes.io/name: traefik - ports: - - name: web - port: 80 - targetPort: web - protocol: TCP - - name: websecure - port: 443 - targetPort: websecure - protocol: TCP - namespace: logging name: oauth2-proxy-logs type: ClusterIP @@ -1191,15 +1447,15 @@ services: port: 4190 targetPort: 4190 protocol: TCP -- namespace: mailu-mailserver - name: mailu-sync-listener +- namespace: maintenance + name: ariadne type: ClusterIP selector: - app: mailu-sync-listener + app: ariadne ports: - name: http - port: 8080 - targetPort: 8080 + port: 80 + targetPort: http protocol: TCP - namespace: monitoring name: dcgm-exporter @@ -1291,6 +1547,10 @@ services: port: 5432 targetPort: 5432 protocol: TCP + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP - namespace: sso name: keycloak type: ClusterIP @@ -1335,6 +1595,20 @@ services: port: 8429 targetPort: 8429 protocol: TCP +- namespace: traefik + name: traefik + type: LoadBalancer + selector: + app: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP - namespace: traefik name: traefik-metrics type: ClusterIP @@ -1447,6 +1721,19 @@ http_endpoints: kind: Ingress name: bstein-dev-home source: bstein-dev-home +- host: budget.bstein.dev + path: / + backend: + namespace: finance + service: actual-budget + port: 80 + workloads: + - kind: Deployment + name: actual-budget + via: + kind: Ingress + name: actual-budget + source: finance - host: call.live.bstein.dev path: / backend: @@ -1499,6 +1786,19 @@ http_endpoints: kind: Ingress name: nextcloud source: nextcloud +- host: health.bstein.dev + path: / + backend: + namespace: health + service: wger + port: 80 + workloads: + - kind: Deployment + name: wger + via: + kind: Ingress + name: wger + source: health - host: kit.live.bstein.dev path: /livekit/jwt backend: @@ -1558,6 +1858,65 @@ http_endpoints: kind: Ingress name: matrix-routing source: comms +- host: live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id002 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: comms - host: logs.bstein.dev path: / backend: @@ -1601,9 +1960,7 @@ http_endpoints: namespace: comms service: matrix-authentication-service port: 8080 - workloads: &id002 - - kind: Deployment - name: matrix-authentication-service + workloads: *id002 via: kind: Ingress name: matrix-routing @@ -1647,9 +2004,7 @@ http_endpoints: namespace: comms service: matrix-guest-register port: 8080 - workloads: &id003 - - kind: Deployment - name: matrix-guest-register + workloads: *id003 via: kind: Ingress name: matrix-routing @@ -1722,6 +2077,19 @@ http_endpoints: kind: Ingress name: monerod source: monerod +- host: money.bstein.dev + path: / + backend: + namespace: finance + service: firefly + port: 80 + workloads: + - kind: Deployment + name: firefly + via: + kind: Ingress + name: firefly + source: finance - host: notes.bstein.dev path: / backend: @@ -1845,7 +2213,6 @@ helmrelease_host_hints: - live.bstein.dev - matrix.live.bstein.dev comms:comms/othrys-synapse: - - bstein.dev - kit.live.bstein.dev - live.bstein.dev - matrix.live.bstein.dev @@ -1856,6 +2223,8 @@ helmrelease_host_hints: - registry.bstein.dev logging:logging/data-prepper: - registry.bstein.dev + longhorn:longhorn-system/longhorn: + - registry.bstein.dev mailu:mailu-mailserver/mailu: - bstein.dev - mail.bstein.dev @@ -1863,5 +2232,8 @@ helmrelease_host_hints: - alerts.bstein.dev monitoring:monitoring/grafana: - bstein.dev + - mail.bstein.dev - metrics.bstein.dev - sso.bstein.dev + monitoring:monitoring/kube-state-metrics: + - atlas.bstein.dev diff --git a/services/comms/knowledge/catalog/runbooks.json b/services/comms/knowledge/catalog/runbooks.json index d7356ca5..0718562b 100644 --- a/services/comms/knowledge/catalog/runbooks.json +++ b/services/comms/knowledge/catalog/runbooks.json @@ -20,6 +20,22 @@ ], "body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured." }, + { + "path": "runbooks/comms-verify.md", + "title": "Othrys verification checklist", + "tags": [ + "comms", + "matrix", + "element", + "livekit" + ], + "entrypoints": [ + "https://live.bstein.dev", + "https://matrix.live.bstein.dev" + ], + "source_paths": [], + "body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `-`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN." + }, { "path": "runbooks/kb-authoring.md", "title": "KB authoring: what to write (and what not to)", diff --git a/services/comms/knowledge/diagrams/atlas-http.mmd b/services/comms/knowledge/diagrams/atlas-http.mmd index ab7c3621..1aa7ac80 100644 --- a/services/comms/knowledge/diagrams/atlas-http.mmd +++ b/services/comms/knowledge/diagrams/atlas-http.mmd @@ -17,6 +17,11 @@ flowchart LR host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_budget_bstein_dev["budget.bstein.dev"] + svc_finance_actual_budget["finance/actual-budget (Service)"] + host_budget_bstein_dev --> svc_finance_actual_budget + wl_finance_actual_budget["finance/actual-budget (Deployment)"] + svc_finance_actual_budget --> wl_finance_actual_budget host_call_live_bstein_dev["call.live.bstein.dev"] svc_comms_element_call["comms/element-call (Service)"] host_call_live_bstein_dev --> svc_comms_element_call @@ -37,6 +42,11 @@ flowchart LR host_cloud_bstein_dev --> svc_nextcloud_nextcloud wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_health_bstein_dev["health.bstein.dev"] + svc_health_wger["health/wger (Service)"] + host_health_bstein_dev --> svc_health_wger + wl_health_wger["health/wger (Deployment)"] + svc_health_wger --> wl_health_wger host_kit_live_bstein_dev["kit.live.bstein.dev"] svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] host_kit_live_bstein_dev --> svc_comms_livekit_token_service @@ -50,6 +60,14 @@ flowchart LR host_live_bstein_dev --> svc_comms_matrix_wellknown svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_logs_bstein_dev["logs.bstein.dev"] svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"] host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs @@ -64,21 +82,20 @@ flowchart LR svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front host_matrix_live_bstein_dev["matrix.live.bstein.dev"] - svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] - svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register - wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] - svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register host_monero_bstein_dev["monero.bstein.dev"] svc_crypto_monerod["crypto/monerod (Service)"] host_monero_bstein_dev --> svc_crypto_monerod wl_crypto_monerod["crypto/monerod (Deployment)"] svc_crypto_monerod --> wl_crypto_monerod + host_money_bstein_dev["money.bstein.dev"] + svc_finance_firefly["finance/firefly (Service)"] + host_money_bstein_dev --> svc_finance_firefly + wl_finance_firefly["finance/firefly (Deployment)"] + svc_finance_firefly --> wl_finance_firefly host_notes_bstein_dev["notes.bstein.dev"] svc_outline_outline["outline/outline (Service)"] host_notes_bstein_dev --> svc_outline_outline @@ -143,19 +160,29 @@ flowchart LR svc_comms_livekit wl_comms_livekit svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service svc_comms_matrix_guest_register wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service end subgraph crypto[crypto] svc_crypto_monerod wl_crypto_monerod end + subgraph finance[finance] + svc_finance_actual_budget + wl_finance_actual_budget + svc_finance_firefly + wl_finance_firefly + end subgraph gitea[gitea] svc_gitea_gitea wl_gitea_gitea end + subgraph health[health] + svc_health_wger + wl_health_wger + end subgraph jellyfin[jellyfin] svc_jellyfin_pegasus wl_jellyfin_pegasus diff --git a/services/comms/knowledge/metis.md b/services/comms/knowledge/metis.md new file mode 100644 index 00000000..5b0d06be --- /dev/null +++ b/services/comms/knowledge/metis.md @@ -0,0 +1,26 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers. + +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). +- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. +- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. diff --git a/services/comms/knowledge/runbooks/comms-verify.md b/services/comms/knowledge/runbooks/comms-verify.md new file mode 100644 index 00000000..8c09d0af --- /dev/null +++ b/services/comms/knowledge/runbooks/comms-verify.md @@ -0,0 +1,30 @@ +--- +title: Othrys verification checklist +tags: + - comms + - matrix + - element + - livekit +entrypoints: + - https://live.bstein.dev + - https://matrix.live.bstein.dev +--- + +1) Guest join: +- Open a private window and visit: + `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join` +- Confirm the guest join flow works and the displayname becomes `-`. + +2) Keycloak login: +- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect. + +3) Video rooms: +- Start an Element Call room and confirm audio/video with a second account. +- Check that guests can read public rooms but cannot start calls. + +4) Well-known: +- `https://live.bstein.dev/.well-known/matrix/client` returns JSON. +- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON. + +5) TURN reachability: +- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN. diff --git a/services/comms/knowledge/software/metis.md b/services/comms/knowledge/software/metis.md new file mode 100644 index 00000000..7ca3b399 --- /dev/null +++ b/services/comms/knowledge/software/metis.md @@ -0,0 +1,73 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers. + +### Jetson nodes (titan-20/21) +- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64. +- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused). +- k3s agent with drop-in 99-nofile.conf. + +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). +- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. +- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. + +## Node OS/Kernel/CRI snapshot (Jan 2026) +- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 +- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 + + +### External hosts +- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled. +- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q). +- titan-23/oceanus: TODO audit (future). + + +### Control plane Pis (titan-0a/0b/0c) +- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2. +- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot. +- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO). + + +## k3s versions +- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2) +- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2) +- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2 diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6fb6bff0..e0776203 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -19,6 +19,8 @@ API_KEY = os.environ.get("CHAT_API_KEY", "") KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") +ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "") +ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "") BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas") SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") @@ -297,6 +299,21 @@ def k8s_get(path: str, timeout: int = 8) -> dict: raw = resp.read() return json.loads(raw.decode()) if raw else {} +def _ariadne_state(timeout: int = 5) -> dict | None: + if not ARIADNE_STATE_URL: + return None + headers = {} + if ARIADNE_STATE_TOKEN: + headers["X-Internal-Token"] = ARIADNE_STATE_TOKEN + r = request.Request(ARIADNE_STATE_URL, headers=headers, method="GET") + try: + with request.urlopen(r, timeout=timeout) as resp: + raw = resp.read() + payload = json.loads(raw.decode()) if raw else {} + return payload if isinstance(payload, dict) else None + except Exception: + return None + def k8s_pods(namespace: str) -> list[dict]: data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500") items = data.get("items") or [] @@ -445,6 +462,17 @@ def vm_cluster_snapshot() -> str: return "\n".join(parts).strip() def nodes_summary(cluster_name: str) -> str: + state = _ariadne_state() + if state: + nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {} + total = nodes.get("total") + ready = nodes.get("ready") + not_ready = nodes.get("not_ready") + if isinstance(total, int) and isinstance(ready, int): + not_ready = not_ready if isinstance(not_ready, int) else max(total - ready, 0) + if not_ready: + return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." + return f"{cluster_name} cluster has {total} nodes, all Ready." try: data = k8s_get("/api/v1/nodes?limit=500") except Exception: @@ -467,6 +495,16 @@ def nodes_summary(cluster_name: str) -> str: return f"{cluster_name} cluster has {total} nodes, all Ready." def nodes_names_summary(cluster_name: str) -> str: + state = _ariadne_state() + if state: + nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {} + names = nodes.get("names") + if isinstance(names, list) and names: + cleaned = sorted({str(n) for n in names if n}) + if len(cleaned) <= 30: + return f"{cluster_name} node names: {', '.join(cleaned)}." + shown = ", ".join(cleaned[:30]) + return f"{cluster_name} node names: {shown}, … (+{len(cleaned) - 30} more)." try: data = k8s_get("/api/v1/nodes?limit=500") except Exception: diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 52d10f96..0356e060 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -311,10 +311,18 @@ spec: value: "0 0 1 1 *" - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM value: "*/10 * * * *" + - name: ARIADNE_SCHEDULE_CLUSTER_STATE + value: "*/15 * * * *" + - name: ARIADNE_CLUSTER_STATE_KEEP + value: "168" - name: WELCOME_EMAIL_ENABLED value: "true" - name: K8S_API_TIMEOUT_SEC value: "5" + - name: ARIADNE_VM_URL + value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 + - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC + value: "5" - name: OPENSEARCH_URL value: http://opensearch-master.logging.svc.cluster.local:9200 - name: OPENSEARCH_LIMIT_BYTES diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml index 88689cb6..33620d05 100644 --- a/services/maintenance/ariadne-rbac.yaml +++ b/services/maintenance/ariadne-rbac.yaml @@ -21,12 +21,27 @@ rules: - list - watch - delete + - apiGroups: [""] + resources: + - nodes + - namespaces + verbs: + - get + - list + - watch - apiGroups: [""] resources: - pods/exec verbs: - get - create + - apiGroups: ["kustomize.toolkit.fluxcd.io"] + resources: + - kustomizations + verbs: + - get + - list + - watch --- apiVersion: rbac.authorization.k8s.io/v1 From 733f420b9a3c1506b2e528265b65d6f3ff3b2a3f Mon Sep 17 00:00:00 2001 From: flux-bot Date: Mon, 26 Jan 2026 06:33:26 +0000 Subject: [PATCH 230/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 4e261cbf..3933caf6 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-48 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-49 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From a3b84a36fd525971197036545b285840c806a3a8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 09:23:21 -0300 Subject: [PATCH 231/416] comms: inject chat ai keys for atlasbot --- services/comms/atlasbot-deployment.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 70844ebf..aec7b790 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -25,6 +25,12 @@ spec: vault.hashicorp.com/agent-inject-secret-livekit-primary: "kv/data/atlas/comms/livekit-api" vault.hashicorp.com/agent-inject-template-livekit-primary: | {{- with secret "kv/data/atlas/comms/livekit-api" -}}{{ .Data.data.primary }}{{- end -}} + vault.hashicorp.com/agent-inject-secret-chat-matrix: "kv/data/atlas/shared/chat-ai-keys-runtime" + vault.hashicorp.com/agent-inject-template-chat-matrix: | + {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.matrix }}{{- end -}} + vault.hashicorp.com/agent-inject-secret-chat-homepage: "kv/data/atlas/shared/chat-ai-keys-runtime" + vault.hashicorp.com/agent-inject-template-chat-homepage: | + {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.homepage }}{{- end -}} vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-template-bot-pass: | {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}} From 80e059f6bb650653a92baf9a7c49e705dae64602 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 09:29:28 -0300 Subject: [PATCH 232/416] comms: fix duplicate chat key annotations --- services/comms/atlasbot-deployment.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index aec7b790..70844ebf 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -25,12 +25,6 @@ spec: vault.hashicorp.com/agent-inject-secret-livekit-primary: "kv/data/atlas/comms/livekit-api" vault.hashicorp.com/agent-inject-template-livekit-primary: | {{- with secret "kv/data/atlas/comms/livekit-api" -}}{{ .Data.data.primary }}{{- end -}} - vault.hashicorp.com/agent-inject-secret-chat-matrix: "kv/data/atlas/shared/chat-ai-keys-runtime" - vault.hashicorp.com/agent-inject-template-chat-matrix: | - {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.matrix }}{{- end -}} - vault.hashicorp.com/agent-inject-secret-chat-homepage: "kv/data/atlas/shared/chat-ai-keys-runtime" - vault.hashicorp.com/agent-inject-template-chat-homepage: | - {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.homepage }}{{- end -}} vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-template-bot-pass: | {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}} From 301fdb49173cc7fff04526bf318360aac679146b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 09:36:08 -0300 Subject: [PATCH 233/416] comms: handle arch node counts and extend LLM timeout --- services/comms/scripts/atlasbot/bot.py | 39 +++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e0776203..797b601d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -16,6 +16,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") API_KEY = os.environ.get("CHAT_API_KEY", "") +OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "90")) KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") @@ -525,6 +526,29 @@ def nodes_names_summary(cluster_name: str) -> str: shown = ", ".join(names[:30]) return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)." + +def nodes_arch_summary(cluster_name: str, arch: str) -> str: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return "" + items = data.get("items") or [] + if not isinstance(items, list) or not items: + return "" + normalized = (arch or "").strip().lower() + if normalized in ("aarch64", "arm64"): + arch_label = "arm64" + elif normalized in ("x86_64", "x86-64", "amd64"): + arch_label = "amd64" + else: + arch_label = normalized + total = 0 + for node in items: + labels = (node.get("metadata") or {}).get("labels") or {} + if labels.get("kubernetes.io/arch") == arch_label: + total += 1 + return f"{cluster_name} cluster has {total} {arch_label} nodes." + def _strip_code_fence(text: str) -> str: cleaned = (text or "").strip() match = CODE_FENCE_RE.match(cleaned) @@ -622,7 +646,7 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str: if API_KEY: headers["x-api-key"] = API_KEY r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) - with request.urlopen(r, timeout=20) as resp: + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: data = json.loads(resp.read().decode()) raw_reply = data.get("message") or data.get("response") or data.get("reply") or data reply = _normalize_reply(raw_reply) or "I'm here to help." @@ -692,6 +716,19 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")): + if any(word in lower_body for word in ("cluster", "atlas", "titan")): + arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64" + summary = nodes_arch_summary("Atlas", arch) + if not summary: + send_msg( + token, + rid, + "I couldn’t reach the cluster API to count nodes by architecture. Try again in a moment.", + ) + continue + send_msg(token, rid, summary) + continue if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): names_summary = nodes_names_summary("Atlas") From 9b09b939215992f40b3a8b0a7ec8666431843797 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 09:38:38 -0300 Subject: [PATCH 234/416] comms: bump atlasbot configmap checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 70844ebf..a8a30092 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-9 + checksum/atlasbot-configmap: manual-atlasbot-10 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 6c84cf60c60a662e36a6ff986dfe23e2da93f2ee Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 11:44:28 -0300 Subject: [PATCH 235/416] ai-llm: tighten gpu placement and resources --- services/ai-llm/deployment.yaml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index dfa1bdd1..4f34d866 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -21,8 +21,8 @@ spec: app: ollama annotations: ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0 - ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24) - ai.bstein.dev/restartedAt: "2026-01-25T19:10:00Z" + ai.bstein.dev/gpu: GPU pool (titan-22/24) + ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z" spec: affinity: nodeAffinity: @@ -32,8 +32,6 @@ spec: - key: kubernetes.io/hostname operator: In values: - - titan-20 - - titan-21 - titan-22 - titan-24 runtimeClassName: nvidia @@ -69,8 +67,8 @@ spec: mountPath: /root/.ollama resources: requests: - cpu: 250m - memory: 1Gi + cpu: 500m + memory: 2Gi nvidia.com/gpu.shared: 1 limits: nvidia.com/gpu.shared: 1 @@ -97,10 +95,10 @@ spec: mountPath: /root/.ollama resources: requests: - cpu: "2" - memory: 8Gi + cpu: "4" + memory: 16Gi nvidia.com/gpu.shared: 1 limits: - cpu: "4" - memory: 12Gi + cpu: "8" + memory: 24Gi nvidia.com/gpu.shared: 1 From b5e8192731053cb6bfcfde861f729f01f75af7af Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 12:06:48 -0300 Subject: [PATCH 236/416] atlasbot: answer jetson nodes from knowledge --- knowledge/catalog/atlas.json | 2 +- knowledge/catalog/atlas.yaml | 2 +- knowledge/catalog/runbooks.json | 8 ++++ scripts/knowledge_render_atlas.py | 9 ++++- services/comms/knowledge/catalog/atlas.json | 2 +- services/comms/knowledge/catalog/atlas.yaml | 2 +- .../comms/knowledge/catalog/runbooks.json | 8 ++++ services/comms/scripts/atlasbot/bot.py | 39 +++++++++++++++++++ 8 files changed, 66 insertions(+), 6 deletions(-) diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json index 21ac4073..951c8079 100644 --- a/knowledge/catalog/atlas.json +++ b/knowledge/catalog/atlas.json @@ -1057,7 +1057,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/ariadne:0.1.0-48" + "registry.bstein.dev/bstein/ariadne:0.1.0-49" ] }, { diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml index b3b0119f..637b5f97 100644 --- a/knowledge/catalog/atlas.yaml +++ b/knowledge/catalog/atlas.yaml @@ -711,7 +711,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/ariadne:0.1.0-48 + - registry.bstein.dev/bstein/ariadne:0.1.0-49 - kind: Deployment namespace: maintenance name: maintenance-vault-sync diff --git a/knowledge/catalog/runbooks.json b/knowledge/catalog/runbooks.json index 0718562b..960510d2 100644 --- a/knowledge/catalog/runbooks.json +++ b/knowledge/catalog/runbooks.json @@ -85,5 +85,13 @@ "clusters/atlas/<...>" ], "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + }, + { + "path": "software/metis.md", + "title": "metis", + "tags": [], + "entrypoints": [], + "source_paths": [], + "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2" } ] diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index 34938e74..206dcd90 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -529,9 +529,14 @@ def main() -> int: diagram_path.write_text(diagram, encoding="utf-8") # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster. - runbooks_dir = out_dir / "runbooks" + runbook_dirs = [ + out_dir / "runbooks", + out_dir / "software", + ] runbooks: list[dict[str, Any]] = [] - if runbooks_dir.exists(): + for runbooks_dir in runbook_dirs: + if not runbooks_dir.exists(): + continue for md_file in sorted(runbooks_dir.glob("*.md")): raw = md_file.read_text(encoding="utf-8") fm: dict[str, Any] = {} diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json index 21ac4073..951c8079 100644 --- a/services/comms/knowledge/catalog/atlas.json +++ b/services/comms/knowledge/catalog/atlas.json @@ -1057,7 +1057,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/ariadne:0.1.0-48" + "registry.bstein.dev/bstein/ariadne:0.1.0-49" ] }, { diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml index b3b0119f..637b5f97 100644 --- a/services/comms/knowledge/catalog/atlas.yaml +++ b/services/comms/knowledge/catalog/atlas.yaml @@ -711,7 +711,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/ariadne:0.1.0-48 + - registry.bstein.dev/bstein/ariadne:0.1.0-49 - kind: Deployment namespace: maintenance name: maintenance-vault-sync diff --git a/services/comms/knowledge/catalog/runbooks.json b/services/comms/knowledge/catalog/runbooks.json index 0718562b..960510d2 100644 --- a/services/comms/knowledge/catalog/runbooks.json +++ b/services/comms/knowledge/catalog/runbooks.json @@ -85,5 +85,13 @@ "clusters/atlas/<...>" ], "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + }, + { + "path": "software/metis.md", + "title": "metis", + "tags": [], + "entrypoints": [], + "source_paths": [], + "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2" } ] diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 797b601d..18ec611a 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -75,6 +75,8 @@ METRIC_HINT_WORDS = { } CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL) +TITAN_NODE_RE = re.compile(r"\\btitan-[0-9a-z]{2}\\b", re.IGNORECASE) +TITAN_RANGE_RE = re.compile(r"\\btitan-([0-9a-z]{2})/([0-9a-z]{2})\\b", re.IGNORECASE) def _tokens(text: str) -> list[str]: toks = [t.lower() for t in TOKEN_RE.findall(text or "")] @@ -233,6 +235,35 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: used += len(chunk) return "\n".join(parts).strip() +def _extract_titan_nodes(text: str) -> list[str]: + names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n} + for match in TITAN_RANGE_RE.finditer(text or ""): + left, right = match.groups() + if left: + names.add(f"titan-{left.lower()}") + if right: + names.add(f"titan-{right.lower()}") + return sorted(names) + +def jetson_nodes_from_kb() -> list[str]: + for doc in KB.get("runbooks", []): + if not isinstance(doc, dict): + continue + body = str(doc.get("body") or "") + for line in body.splitlines(): + if "jetson" not in line.lower(): + continue + names = _extract_titan_nodes(line) + if names: + return names + return [] + +def jetson_nodes_summary(cluster_name: str) -> str: + names = jetson_nodes_from_kb() + if names: + return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}." + return "" + def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]: q = (query or "").strip() if not q or not KB.get("catalog"): @@ -729,6 +760,14 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + if "jetson" in lower_body: + if any(word in lower_body for word in ("cluster", "atlas", "titan", "node", "nodes")): + summary = jetson_nodes_summary("Atlas") + if summary: + send_msg(token, rid, summary) + else: + send_msg(token, rid, "Jetson inventory is not available in the knowledge base yet.") + continue if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): names_summary = nodes_names_summary("Atlas") From a1494a75218ef893dc5b331b025957a249cb4a4f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 12:08:33 -0300 Subject: [PATCH 237/416] comms: bump atlasbot config checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index a8a30092..c96c79c4 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-10 + checksum/atlasbot-configmap: manual-atlasbot-11 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From e7c3d25dfcbf97b06053fb42374569ca08a7e14e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 12:36:51 -0300 Subject: [PATCH 238/416] atlasbot: ground node inventory and soften llm failures --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 216 ++++++++++++++++++++---- 2 files changed, 181 insertions(+), 37 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c96c79c4..2c08853d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-11 + checksum/atlasbot-configmap: manual-atlasbot-12 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 18ec611a..8edc28dc 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3,6 +3,7 @@ import json import os import re import ssl +import threading import time from typing import Any from urllib import error, parse, request @@ -156,6 +157,13 @@ def send_msg(token: str, room: str, text: str): KB = {"catalog": {}, "runbooks": []} _HOST_INDEX: dict[str, list[dict]] = {} _NAME_INDEX: set[str] = set() +_NODE_CLASS_INDEX: dict[str, list[str]] = {} +_NODE_CLASS_RPI4: set[str] = set() +_NODE_CLASS_RPI5: set[str] = set() +_NODE_CLASS_AMD64: set[str] = set() +_NODE_CLASS_JETSON: set[str] = set() +_NODE_CLASS_EXTERNAL: set[str] = set() +_NODE_CLASS_NON_RPI: set[str] = set() def _load_json_file(path: str) -> Any | None: try: @@ -166,6 +174,8 @@ def _load_json_file(path: str) -> Any | None: def load_kb(): global KB, _HOST_INDEX, _NAME_INDEX + global _NODE_CLASS_INDEX, _NODE_CLASS_RPI4, _NODE_CLASS_RPI5, _NODE_CLASS_AMD64, _NODE_CLASS_JETSON + global _NODE_CLASS_EXTERNAL, _NODE_CLASS_NON_RPI if not KB_DIR: return catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} @@ -188,6 +198,24 @@ def load_kb(): names.add(str(w["name"]).lower()) _NAME_INDEX = names + node_classes = _parse_node_classes(runbooks) + _NODE_CLASS_INDEX = node_classes + _NODE_CLASS_RPI4 = set(node_classes.get("rpi4", [])) + _NODE_CLASS_RPI5 = set(node_classes.get("rpi5", [])) + _NODE_CLASS_AMD64 = set(node_classes.get("amd64", [])) + _NODE_CLASS_JETSON = set(node_classes.get("jetson", [])) + _NODE_CLASS_EXTERNAL = set(node_classes.get("external", [])) + _NODE_CLASS_NON_RPI = set( + sorted( + ( + set().union(*node_classes.values()) + - _NODE_CLASS_RPI4 + - _NODE_CLASS_RPI5 + - _NODE_CLASS_EXTERNAL + ) + ) + ) + def kb_retrieve(query: str, *, limit: int = 3) -> str: q = (query or "").strip() if not q or not KB.get("runbooks"): @@ -237,6 +265,12 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: def _extract_titan_nodes(text: str) -> list[str]: names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n} + for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE): + tail = match.group(1) + for part in re.split(r"[/,]", tail): + part = part.strip() + if part: + names.add(f"titan-{part.lower()}") for match in TITAN_RANGE_RE.finditer(text or ""): left, right = match.groups() if left: @@ -245,6 +279,83 @@ def _extract_titan_nodes(text: str) -> list[str]: names.add(f"titan-{right.lower()}") return sorted(names) +def _parse_node_classes(runbooks: list[dict[str, Any]]) -> dict[str, list[str]]: + classes: dict[str, list[str]] = {} + for doc in runbooks: + if not isinstance(doc, dict): + continue + body = str(doc.get("body") or "") + for line in body.splitlines(): + stripped = line.strip() + if "titan-" not in stripped.lower(): + continue + label = "" + nodes: list[str] = [] + if stripped.startswith("-") and ":" in stripped: + label, rest = stripped.lstrip("-").split(":", 1) + nodes = _extract_titan_nodes(rest) + label = label.strip().lower() + else: + nodes = _extract_titan_nodes(stripped) + if not nodes: + continue + if "jetson" in stripped.lower(): + classes.setdefault("jetson", nodes) + if "amd64" in stripped.lower() or "x86" in stripped.lower(): + classes.setdefault("amd64", nodes) + if "rpi4" in stripped.lower(): + classes.setdefault("rpi4", nodes) + if "rpi5" in stripped.lower(): + classes.setdefault("rpi5", nodes) + if "external" in stripped.lower() or "non-cluster" in stripped.lower(): + classes.setdefault("external", nodes) + if label: + classes.setdefault(label, nodes) + return {k: sorted(set(v)) for k, v in classes.items()} + +def node_inventory_answer(cluster_name: str, query: str) -> str: + q = (query or "").lower() + if "jetson" in q and _NODE_CLASS_JETSON: + names = sorted(_NODE_CLASS_JETSON) + return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}." + if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: + names = sorted(_NODE_CLASS_NON_RPI) + if names: + return f"{cluster_name} non‑Raspberry Pi nodes: {', '.join(names)}." + if "raspberry" in q or "rpi" in q: + if "rpi4" in q and _NODE_CLASS_RPI4: + names = sorted(_NODE_CLASS_RPI4) + return f"{cluster_name} rpi4 nodes: {', '.join(names)}." + if "rpi5" in q and _NODE_CLASS_RPI5: + names = sorted(_NODE_CLASS_RPI5) + return f"{cluster_name} rpi5 nodes: {', '.join(names)}." + names = sorted(_NODE_CLASS_RPI4 | _NODE_CLASS_RPI5) + if names: + return f"{cluster_name} Raspberry Pi nodes: {', '.join(names)}." + if ("amd64" in q or "x86" in q) and _NODE_CLASS_AMD64: + names = sorted(_NODE_CLASS_AMD64) + return f"{cluster_name} amd64 nodes: {', '.join(names)}." + return "" + +def node_inventory_context(query: str) -> str: + q = (query or "").lower() + if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "x86", "cluster")): + return "" + lines: list[str] = ["Node inventory (KB):"] + if _NODE_CLASS_RPI5: + lines.append(f"- rpi5: {', '.join(sorted(_NODE_CLASS_RPI5))}") + if _NODE_CLASS_RPI4: + lines.append(f"- rpi4: {', '.join(sorted(_NODE_CLASS_RPI4))}") + if _NODE_CLASS_JETSON: + lines.append(f"- jetson: {', '.join(sorted(_NODE_CLASS_JETSON))}") + if _NODE_CLASS_AMD64: + lines.append(f"- amd64: {', '.join(sorted(_NODE_CLASS_AMD64))}") + if _NODE_CLASS_EXTERNAL: + lines.append(f"- external: {', '.join(sorted(_NODE_CLASS_EXTERNAL))}") + if len(lines) == 1: + return "" + return "\n".join(lines) + def jetson_nodes_from_kb() -> list[str]: for doc in KB.get("runbooks", []): if not isinstance(doc, dict): @@ -627,6 +738,10 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st if endpoints: parts.append(endpoints) + inventory = node_inventory_context(prompt) + if inventory: + parts.append(inventory) + if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set) @@ -656,35 +771,58 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st return "\n\n".join([p for p in parts if p]).strip() -def ollama_reply(hist_key, prompt: str, *, context: str) -> str: - try: - system = ( - "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " - "Be helpful, direct, and concise. " - "Prefer answering with exact repo paths and Kubernetes resource names. " - "Never include or request secret values. " - "Respond in plain sentences; do not return JSON or code fences unless explicitly asked." - ) - transcript_parts = [system] - if context: - transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS]) - transcript_parts.extend(history[hist_key][-24:]) - transcript_parts.append(f"User: {prompt}") - transcript = "\n".join(transcript_parts) +def _ollama_call(hist_key, prompt: str, *, context: str) -> str: + system = ( + "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " + "Be helpful, direct, and concise. " + "Prefer answering with exact repo paths and Kubernetes resource names. " + "Never include or request secret values. " + "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " + "If the answer is not grounded in the provided context or tool data, say you do not know." + ) + transcript_parts = [system] + if context: + transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS]) + transcript_parts.extend(history[hist_key][-24:]) + transcript_parts.append(f"User: {prompt}") + transcript = "\n".join(transcript_parts) - payload = {"model": MODEL, "message": transcript} - headers = {"Content-Type": "application/json"} - if API_KEY: - headers["x-api-key"] = API_KEY - r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) - with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: - data = json.loads(resp.read().decode()) - raw_reply = data.get("message") or data.get("response") or data.get("reply") or data - reply = _normalize_reply(raw_reply) or "I'm here to help." - history[hist_key].append(f"Atlas: {reply}") - return reply + payload = {"model": MODEL, "message": transcript} + headers = {"Content-Type": "application/json"} + if API_KEY: + headers["x-api-key"] = API_KEY + r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + raw_reply = data.get("message") or data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." + history[hist_key].append(f"Atlas: {reply}") + return reply + +def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str: + try: + return _ollama_call(hist_key, prompt, context=context) except Exception: - return "I’m here — but I couldn’t reach the model backend." + if fallback: + history[hist_key].append(f"Atlas: {fallback}") + return fallback + return "Model backend is busy. Try again in a moment." + +def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str: + result: dict[str, str] = {"reply": ""} + done = threading.Event() + + def worker(): + result["reply"] = ollama_reply(hist_key, prompt, context=context, fallback=fallback) + done.set() + + thread = threading.Thread(target=worker, daemon=True) + thread.start() + if not done.wait(2.0): + send_msg(token, room, "Thinking…") + done.wait() + thread.join(timeout=1) + return result["reply"] or fallback or "Model backend is busy. Try again in a moment." def sync_loop(token: str, room_id: str): since = None @@ -747,6 +885,10 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + inventory_answer = node_inventory_answer("Atlas", lower_body) + if inventory_answer: + send_msg(token, rid, inventory_answer) + continue if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")): if any(word in lower_body for word in ("cluster", "atlas", "titan")): arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64" @@ -760,14 +902,6 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue - if "jetson" in lower_body: - if any(word in lower_body for word in ("cluster", "atlas", "titan", "node", "nodes")): - summary = jetson_nodes_summary("Atlas") - if summary: - send_msg(token, rid, summary) - else: - send_msg(token, rid, "Jetson inventory is not available in the knowledge base yet.") - continue if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): names_summary = nodes_names_summary("Atlas") @@ -803,7 +937,17 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - reply = ollama_reply(hist_key, body, context=context) + fallback = "" + if "node" in lower_body or "cluster" in lower_body: + fallback = node_inventory_answer("Atlas", lower_body) + reply = ollama_reply_with_thinking( + token, + rid, + hist_key, + body, + context=context, + fallback=fallback, + ) send_msg(token, rid, reply) def login_with_retry(): From 4f9479c7d568ed50582f17b38d4fe7eae1e8bcf9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 14:08:11 -0300 Subject: [PATCH 239/416] atlasbot: add metrics kb and long timeout --- knowledge/catalog/metrics.json | 1880 +++++++++++++++++ scripts/knowledge_render_atlas.py | 65 + .../bstein-dev-home/backend-deployment.yaml | 4 +- .../chat-ai-gateway-deployment.yaml | 2 + services/bstein-dev-home/scripts/gateway.py | 3 +- services/comms/atlasbot-deployment.yaml | 8 +- services/comms/knowledge/catalog/metrics.json | 1880 +++++++++++++++++ services/comms/kustomization.yaml | 1 + services/comms/scripts/atlasbot/bot.py | 97 +- 9 files changed, 3934 insertions(+), 6 deletions(-) create mode 100644 knowledge/catalog/metrics.json create mode 100644 services/comms/knowledge/catalog/metrics.json diff --git a/knowledge/catalog/metrics.json b/knowledge/catalog/metrics.json new file mode 100644 index 00000000..e929db58 --- /dev/null +++ b/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + "panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index 206dcd90..1e305cbb 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -26,6 +26,7 @@ from typing import Any, Iterable import yaml REPO_ROOT = Path(__file__).resolve().parents[1] +DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards" CLUSTER_SCOPED_KINDS = { "Namespace", @@ -67,6 +68,64 @@ def _sync_tree(source: Path, dest: Path) -> None: shutil.copytree(source, dest) +def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]: + panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else [] + for panel in panels: + if not isinstance(panel, dict): + continue + if panel.get("type") == "row" and isinstance(panel.get("panels"), list): + yield from _iter_dashboard_panels({"panels": panel.get("panels")}) + continue + yield panel + + +def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]: + index: list[dict[str, Any]] = [] + for path in sorted(dashboard_dir.glob("*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + continue + if not isinstance(data, dict): + continue + dash_title = data.get("title") or path.stem + dash_tags = data.get("tags") or [] + for panel in _iter_dashboard_panels(data): + targets = panel.get("targets") + if not isinstance(targets, list): + continue + exprs: list[str] = [] + for target in targets: + if not isinstance(target, dict): + continue + expr = target.get("expr") + if isinstance(expr, str) and expr.strip(): + exprs.append(expr.strip()) + if not exprs: + continue + datasource = panel.get("datasource") or {} + if isinstance(datasource, dict): + ds_uid = datasource.get("uid") + ds_type = datasource.get("type") + else: + ds_uid = None + ds_type = None + index.append( + { + "dashboard": dash_title, + "panel_title": panel.get("title") or "", + "panel_id": panel.get("id"), + "panel_type": panel.get("type"), + "description": panel.get("description") or "", + "tags": dash_tags, + "datasource_uid": ds_uid, + "datasource_type": ds_type, + "exprs": exprs, + } + ) + return index + + def kustomize_build(path: Path) -> str: rel = path.relative_to(REPO_ROOT) try: @@ -516,6 +575,7 @@ def main() -> int: summary_path = out_dir / "catalog" / "atlas-summary.json" diagram_path = out_dir / "diagrams" / "atlas-http.mmd" runbooks_json_path = out_dir / "catalog" / "runbooks.json" + metrics_json_path = out_dir / "catalog" / "metrics.json" catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix() catalog_path.write_text( @@ -560,12 +620,17 @@ def main() -> int: } ) runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8") + metrics_index = _extract_metrics_index(DASHBOARD_DIR) + metrics_json_path.write_text( + json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8" + ) print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}") print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {summary_path.relative_to(REPO_ROOT)}") print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}") print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") + print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}") if args.sync_comms: comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 2170396e..ecf478cc 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -58,14 +58,14 @@ spec: args: - >- . /vault/secrets/portal-env.sh - && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 180 app:app + && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 600 app:app env: - name: AI_CHAT_API value: http://ollama.ai.svc.cluster.local:11434 - name: AI_CHAT_MODEL value: qwen2.5-coder:7b-instruct-q4_0 - name: AI_CHAT_TIMEOUT_SEC - value: "60" + value: "480" - name: AI_NODE_NAME valueFrom: fieldRef: diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 40d74fe1..7209da62 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -47,6 +47,8 @@ spec: env: - name: UPSTREAM_URL value: http://bstein-dev-home-backend/api/chat + - name: UPSTREAM_TIMEOUT_SEC + value: "600" ports: - name: http containerPort: 8080 diff --git a/services/bstein-dev-home/scripts/gateway.py b/services/bstein-dev-home/scripts/gateway.py index 3ca2fa16..19d36062 100644 --- a/services/bstein-dev-home/scripts/gateway.py +++ b/services/bstein-dev-home/scripts/gateway.py @@ -6,6 +6,7 @@ from urllib import request, error UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat") KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "") KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "") +UPSTREAM_TIMEOUT_SEC = float(os.environ.get("UPSTREAM_TIMEOUT_SEC", "90")) ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k} @@ -41,7 +42,7 @@ class Handler(BaseHTTPRequestHandler): headers={"Content-Type": "application/json"}, method="POST", ) - with request.urlopen(upstream_req, timeout=90) as resp: + with request.urlopen(upstream_req, timeout=UPSTREAM_TIMEOUT_SEC) as resp: data = resp.read() self.send_response(resp.status) for k, v in resp.headers.items(): diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 2c08853d..031abb8d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-12 + checksum/atlasbot-configmap: manual-atlasbot-13 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -83,6 +83,10 @@ spec: value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL value: qwen2.5-coder:7b-instruct-q4_0 + - name: OLLAMA_TIMEOUT_SEC + value: "480" + - name: ATLASBOT_THINKING_INTERVAL_SEC + value: "120" resources: requests: cpu: 100m @@ -114,6 +118,8 @@ spec: path: catalog/atlas.json - key: atlas-summary.json path: catalog/atlas-summary.json + - key: metrics.json + path: catalog/metrics.json - key: runbooks.json path: catalog/runbooks.json - key: atlas-http.mmd diff --git a/services/comms/knowledge/catalog/metrics.json b/services/comms/knowledge/catalog/metrics.json new file mode 100644 index 00000000..e929db58 --- /dev/null +++ b/services/comms/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + "panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 33600676..37f681de 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -73,5 +73,6 @@ configMapGenerator: - INDEX.md=knowledge/INDEX.md - atlas.json=knowledge/catalog/atlas.json - atlas-summary.json=knowledge/catalog/atlas-summary.json + - metrics.json=knowledge/catalog/metrics.json - runbooks.json=knowledge/catalog/runbooks.json - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 8edc28dc..e604e65f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -17,7 +17,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") API_KEY = os.environ.get("CHAT_API_KEY", "") -OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "90")) +OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") @@ -29,6 +29,7 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) +THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE) HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)") @@ -59,8 +60,21 @@ STOPWORDS = { } METRIC_HINT_WORDS = { + "bandwidth", + "connections", + "cpu", + "database", + "db", + "disk", "health", + "memory", + "network", + "node", + "nodes", + "postgres", "status", + "storage", + "usage", "down", "slow", "error", @@ -157,6 +171,7 @@ def send_msg(token: str, room: str, text: str): KB = {"catalog": {}, "runbooks": []} _HOST_INDEX: dict[str, list[dict]] = {} _NAME_INDEX: set[str] = set() +_METRIC_INDEX: list[dict[str, Any]] = [] _NODE_CLASS_INDEX: dict[str, list[str]] = {} _NODE_CLASS_RPI4: set[str] = set() _NODE_CLASS_RPI5: set[str] = set() @@ -180,6 +195,7 @@ def load_kb(): return catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} runbooks = _load_json_file(os.path.join(KB_DIR, "catalog", "runbooks.json")) or [] + metrics = _load_json_file(os.path.join(KB_DIR, "catalog", "metrics.json")) or [] KB = {"catalog": catalog, "runbooks": runbooks} host_index: dict[str, list[dict]] = collections.defaultdict(list) @@ -197,6 +213,7 @@ def load_kb(): if isinstance(w, dict) and w.get("name"): names.add(str(w["name"]).lower()) _NAME_INDEX = names + _METRIC_INDEX = metrics if isinstance(metrics, list) else [] node_classes = _parse_node_classes(runbooks) _NODE_CLASS_INDEX = node_classes @@ -356,6 +373,65 @@ def node_inventory_context(query: str) -> str: return "" return "\n".join(lines) +def _metric_tokens(entry: dict[str, Any]) -> str: + parts: list[str] = [] + for key in ("panel_title", "dashboard", "description"): + val = entry.get(key) + if isinstance(val, str) and val: + parts.append(val.lower()) + tags = entry.get("tags") + if isinstance(tags, list): + parts.extend(str(t).lower() for t in tags if t) + return " ".join(parts) + +def metrics_lookup(query: str, limit: int = 3) -> list[dict[str, Any]]: + q_tokens = _tokens(query) + if not q_tokens or not _METRIC_INDEX: + return [] + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in _METRIC_INDEX: + if not isinstance(entry, dict): + continue + hay = _metric_tokens(entry) + if not hay: + continue + score = 0 + for t in set(q_tokens): + if t in hay: + score += 2 if t in (entry.get("panel_title") or "").lower() else 1 + if score: + scored.append((score, entry)) + scored.sort(key=lambda item: item[0], reverse=True) + return [entry for _, entry in scored[:limit]] + +def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: + if not allow_tools: + return "", "" + lower = (prompt or "").lower() + if not any(word in lower for word in METRIC_HINT_WORDS): + return "", "" + matches = metrics_lookup(prompt, limit=1) + if not matches: + return "", "" + entry = matches[0] + exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] + if not exprs: + return "", "" + rendered_parts: list[str] = [] + for expr in exprs[:2]: + res = vm_query(expr, timeout=20) + rendered = vm_render_result(res, limit=10) + if rendered: + rendered_parts.append(rendered) + if not rendered_parts: + return "", "" + dashboard = entry.get("dashboard") or "dashboard" + panel = entry.get("panel_title") or "panel" + summary = "\n".join(rendered_parts) + context = f"Metrics (from {dashboard} / {panel}):\n{summary}" + fallback = f"{panel}: {summary}" + return context, fallback + def jetson_nodes_from_kb() -> list[str]: for doc in KB.get("runbooks", []): if not isinstance(doc, dict): @@ -777,6 +853,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "Be helpful, direct, and concise. " "Prefer answering with exact repo paths and Kubernetes resource names. " "Never include or request secret values. " + "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " "If the answer is not grounded in the provided context or tool data, say you do not know." ) @@ -820,7 +897,17 @@ def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, thread.start() if not done.wait(2.0): send_msg(token, room, "Thinking…") - done.wait() + prompt_hint = " ".join((prompt or "").split()) + if len(prompt_hint) > 160: + prompt_hint = prompt_hint[:157] + "…" + heartbeat = max(10, THINKING_INTERVAL_SEC) + next_heartbeat = time.monotonic() + heartbeat + while not done.wait(max(0, next_heartbeat - time.monotonic())): + if prompt_hint: + send_msg(token, room, f"Still thinking about: {prompt_hint} (gathering context)") + else: + send_msg(token, room, "Still thinking (gathering context)…") + next_heartbeat += heartbeat thread.join(timeout=1) return result["reply"] or fallback or "Model backend is busy. Try again in a moment." @@ -937,9 +1024,15 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra + metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_tools) + if metrics_context: + context = (context + "\n\n" + metrics_context).strip() if context else metrics_context + fallback = "" if "node" in lower_body or "cluster" in lower_body: fallback = node_inventory_answer("Atlas", lower_body) + if metrics_fallback and not fallback: + fallback = metrics_fallback reply = ollama_reply_with_thinking( token, rid, From 349688529409927eddef4d30a35131a7573b0136 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 15:34:52 -0300 Subject: [PATCH 240/416] atlasbot: load metrics index and answer in rooms --- services/comms/scripts/atlasbot/bot.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e604e65f..ff9019e7 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -188,7 +188,7 @@ def _load_json_file(path: str) -> Any | None: return None def load_kb(): - global KB, _HOST_INDEX, _NAME_INDEX + global KB, _HOST_INDEX, _NAME_INDEX, _METRIC_INDEX global _NODE_CLASS_INDEX, _NODE_CLASS_RPI4, _NODE_CLASS_RPI5, _NODE_CLASS_AMD64, _NODE_CLASS_JETSON global _NODE_CLASS_EXTERNAL, _NODE_CLASS_NON_RPI if not KB_DIR: @@ -414,6 +414,8 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: if not matches: return "", "" entry = matches[0] + dashboard = entry.get("dashboard") or "dashboard" + panel = entry.get("panel_title") or "panel" exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] if not exprs: return "", "" @@ -424,9 +426,7 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: if rendered: rendered_parts.append(rendered) if not rendered_parts: - return "", "" - dashboard = entry.get("dashboard") or "dashboard" - panel = entry.get("panel_title") or "panel" + return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data." summary = "\n".join(rendered_parts) context = f"Metrics (from {dashboard} / {panel}):\n{summary}" fallback = f"{panel}: {summary}" @@ -998,8 +998,9 @@ def sync_loop(token: str, room_id: str): send_msg(token, rid, names_summary) continue - # Only do live cluster/metrics introspection in DMs. + # Only do live cluster introspection in DMs; metrics can be answered when mentioned. allow_tools = is_dm + allow_metrics = is_dm or mentioned promql = "" if allow_tools: @@ -1024,7 +1025,7 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_tools) + metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics) if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context From b92bd79c98e41ad8853f46e037a86e223330f8e9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 15:54:00 -0300 Subject: [PATCH 241/416] atlasbot: recognize prefix mentions --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 031abb8d..aa91dcb1 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -78,7 +78,7 @@ spec: - name: BOT_USER value: atlasbot - name: BOT_MENTIONS - value: atlasbot + value: atlasbot,aatlasbot - name: OLLAMA_URL value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index ff9019e7..f4182cd4 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -119,9 +119,21 @@ def normalize_user_id(token: str) -> str: MENTION_USER_IDS = {normalize_user_id(t).lower() for t in MENTION_TOKENS if normalize_user_id(t)} +def _body_mentions_token(body: str) -> bool: + lower = (body or "").strip().lower() + if not lower: + return False + for token in MENTION_LOCALPARTS: + for prefix in (token, f"@{token}"): + if lower.startswith(prefix + ":") or lower.startswith(prefix + ",") or lower.startswith(prefix + " "): + return True + return False + def is_mentioned(content: dict, body: str) -> bool: if MENTION_RE.search(body or "") is not None: return True + if _body_mentions_token(body or ""): + return True mentions = content.get("m.mentions", {}) user_ids = mentions.get("user_ids", []) if not isinstance(user_ids, list): From 2398e287537b8c8da7c6e11ec7a1b09c3e1060f9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 18:16:14 -0300 Subject: [PATCH 242/416] atlasbot: improve worker readiness and metrics replies --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 140 +++++++++++++++++++++++- 2 files changed, 140 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index aa91dcb1..d5d8f06f 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-13 + checksum/atlasbot-configmap: manual-atlasbot-14 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f4182cd4..57549b37 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -441,7 +441,7 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data." summary = "\n".join(rendered_parts) context = f"Metrics (from {dashboard} / {panel}):\n{summary}" - fallback = f"{panel}: {summary}" + fallback = _metrics_fallback_summary(panel, summary) return context, fallback def jetson_nodes_from_kb() -> list[str]: @@ -654,6 +654,115 @@ def vm_render_result(res: dict | None, limit: int = 12) -> str: out.append(f"- {labels}: {val}") return "\n".join(out) +def _parse_metric_lines(summary: str) -> dict[str, str]: + parsed: dict[str, str] = {} + for line in (summary or "").splitlines(): + line = line.strip() + if not line.startswith("-"): + continue + try: + label, value = line.lstrip("-").split(":", 1) + except ValueError: + continue + parsed[label.strip()] = value.strip() + return parsed + +def _metrics_fallback_summary(panel: str, summary: str) -> str: + parsed = _parse_metric_lines(summary) + panel_l = (panel or "").lower() + if panel_l.startswith("postgres connections"): + used = parsed.get("conn=used") + maxv = parsed.get("conn=max") + if used and maxv: + try: + used_i = int(float(used)) + max_i = int(float(maxv)) + except ValueError: + return f"Postgres connections: {summary}" + free = max_i - used_i + return f"Postgres connections: {used_i}/{max_i} used ({free} free)." + if panel_l.startswith("postgres hottest"): + if parsed: + label, value = next(iter(parsed.items())) + return f"Most Postgres connections: {label} = {value}." + return f"{panel}: {summary}" + +def _node_ready_status(node: dict) -> bool | None: + conditions = node.get("status", {}).get("conditions") or [] + for cond in conditions if isinstance(conditions, list) else []: + if cond.get("type") == "Ready": + if cond.get("status") == "True": + return True + if cond.get("status") == "False": + return False + return None + return None + +def _node_is_worker(node: dict) -> bool: + labels = (node.get("metadata") or {}).get("labels") or {} + if labels.get("node-role.kubernetes.io/control-plane") is not None: + return False + if labels.get("node-role.kubernetes.io/master") is not None: + return False + if labels.get("node-role.kubernetes.io/worker") is not None: + return True + return True + +def worker_nodes_status() -> tuple[list[str], list[str]]: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return ([], []) + items = data.get("items") or [] + ready_nodes: list[str] = [] + not_ready_nodes: list[str] = [] + for node in items if isinstance(items, list) else []: + if not _node_is_worker(node): + continue + name = (node.get("metadata") or {}).get("name") or "" + if not name: + continue + ready = _node_ready_status(node) + if ready is True: + ready_nodes.append(name) + elif ready is False: + not_ready_nodes.append(name) + return (sorted(ready_nodes), sorted(not_ready_nodes)) + +def expected_nodes_from_kb() -> set[str]: + if not _NODE_CLASS_INDEX: + return set() + nodes = set().union(*_NODE_CLASS_INDEX.values()) + return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL} + +def missing_nodes_answer(cluster_name: str) -> str: + expected = expected_nodes_from_kb() + if not expected: + return "" + current = set() + try: + data = k8s_get("/api/v1/nodes?limit=500") + items = data.get("items") or [] + for node in items if isinstance(items, list) else []: + name = (node.get("metadata") or {}).get("name") or "" + if name: + current.add(name) + except Exception: + return "" + missing = sorted(expected - current) + if not missing: + return f"{cluster_name}: no missing nodes versus KB inventory." + return f"{cluster_name} missing nodes versus KB inventory: {', '.join(missing)}." + +def _should_short_circuit(prompt: str, fallback: str) -> bool: + if not fallback: + return False + lower = (prompt or "").lower() + for word in ("why", "explain", "architecture", "breakdown", "root cause", "plan"): + if word in lower: + return False + return True + def vm_top_restarts(hours: int = 1) -> str: q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" res = vm_query(q) @@ -984,6 +1093,32 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue + if "worker" in lower_body and "node" in lower_body: + ready_nodes, not_ready_nodes = worker_nodes_status() + total = len(ready_nodes) + len(not_ready_nodes) + if total: + if any(word in lower_body for word in ("ready", "not ready", "unready")): + if not_ready_nodes: + send_msg( + token, + rid, + f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.", + ) + else: + send_msg(token, rid, f"All {len(ready_nodes)} worker nodes are Ready.") + continue + if any(word in lower_body for word in ("how many", "should")): + send_msg( + token, + rid, + f"Atlas has {total} worker nodes; {len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady.", + ) + continue + if "missing" in lower_body and "node" in lower_body: + missing = missing_nodes_answer("Atlas") + if missing: + send_msg(token, rid, missing) + continue inventory_answer = node_inventory_answer("Atlas", lower_body) if inventory_answer: send_msg(token, rid, inventory_answer) @@ -1046,6 +1181,9 @@ def sync_loop(token: str, room_id: str): fallback = node_inventory_answer("Atlas", lower_body) if metrics_fallback and not fallback: fallback = metrics_fallback + if _should_short_circuit(body, fallback): + send_msg(token, rid, fallback) + continue reply = ollama_reply_with_thinking( token, rid, From afa5d3bd2928dfce7aea05239c95a57f45795e55 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 18:18:42 -0300 Subject: [PATCH 243/416] atlasbot: improve worker node answers --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d5d8f06f..69aef2f8 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-14 + checksum/atlasbot-configmap: manual-atlasbot-15 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 57549b37..3b9082d8 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1097,6 +1097,7 @@ def sync_loop(token: str, room_id: str): ready_nodes, not_ready_nodes = worker_nodes_status() total = len(ready_nodes) + len(not_ready_nodes) if total: + missing_hint = missing_nodes_answer("Atlas") if any(word in lower_body for word in ("ready", "not ready", "unready")): if not_ready_nodes: send_msg( @@ -1105,14 +1106,19 @@ def sync_loop(token: str, room_id: str): f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.", ) else: - send_msg(token, rid, f"All {len(ready_nodes)} worker nodes are Ready.") + msg = f"All {len(ready_nodes)} worker nodes are Ready." + if missing_hint and "no missing" not in missing_hint: + msg += f" {missing_hint}" + send_msg(token, rid, msg) continue if any(word in lower_body for word in ("how many", "should")): - send_msg( - token, - rid, - f"Atlas has {total} worker nodes; {len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady.", + msg = ( + f"Atlas has {total} worker nodes; " + f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady." ) + if missing_hint and "no missing" not in missing_hint: + msg += f" {missing_hint}" + send_msg(token, rid, msg) continue if "missing" in lower_body and "node" in lower_body: missing = missing_nodes_answer("Atlas") From 70a095f5d06f0a4ffc05d391a48a6ef5ae77886c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 18:21:17 -0300 Subject: [PATCH 244/416] atlasbot: clarify worker count limits --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 69aef2f8..802021f8 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-15 + checksum/atlasbot-configmap: manual-atlasbot-16 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 3b9082d8..71537238 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1118,6 +1118,8 @@ def sync_loop(token: str, room_id: str): ) if missing_hint and "no missing" not in missing_hint: msg += f" {missing_hint}" + elif "should" in lower_body: + msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state." send_msg(token, rid, msg) continue if "missing" in lower_body and "node" in lower_body: From dce37f403512bac057c9c3427f486ff72cb7741d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 18:50:23 -0300 Subject: [PATCH 245/416] atlasbot: infer worker expected count from metrics --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 33 +++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 802021f8..b7843abd 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-16 + checksum/atlasbot-configmap: manual-atlasbot-17 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 71537238..bd40a9f9 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -191,6 +191,7 @@ _NODE_CLASS_AMD64: set[str] = set() _NODE_CLASS_JETSON: set[str] = set() _NODE_CLASS_EXTERNAL: set[str] = set() _NODE_CLASS_NON_RPI: set[str] = set() +NODE_REGEX = re.compile(r'node=~"([^"]+)"') def _load_json_file(path: str) -> Any | None: try: @@ -735,6 +736,23 @@ def expected_nodes_from_kb() -> set[str]: nodes = set().union(*_NODE_CLASS_INDEX.values()) return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL} +def expected_worker_nodes_from_metrics() -> list[str]: + for entry in _METRIC_INDEX: + panel = (entry.get("panel_title") or "").lower() + if "worker nodes ready" not in panel: + continue + exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] + for expr in exprs: + if not isinstance(expr, str): + continue + match = NODE_REGEX.search(expr) + if not match: + continue + raw = match.group(1) + nodes = [n.strip() for n in raw.split("|") if n.strip()] + return sorted(nodes) + return [] + def missing_nodes_answer(cluster_name: str) -> str: expected = expected_nodes_from_kb() if not expected: @@ -1098,6 +1116,8 @@ def sync_loop(token: str, room_id: str): total = len(ready_nodes) + len(not_ready_nodes) if total: missing_hint = missing_nodes_answer("Atlas") + expected_workers = expected_worker_nodes_from_metrics() + expected_total = len(expected_workers) if expected_workers else 0 if any(word in lower_body for word in ("ready", "not ready", "unready")): if not_ready_nodes: send_msg( @@ -1107,7 +1127,11 @@ def sync_loop(token: str, room_id: str): ) else: msg = f"All {len(ready_nodes)} worker nodes are Ready." - if missing_hint and "no missing" not in missing_hint: + if expected_total and len(ready_nodes) != expected_total: + missing = sorted(set(expected_workers) - set(ready_nodes)) + if missing: + msg += f" Missing: {', '.join(missing)}." + elif missing_hint and "no missing" not in missing_hint: msg += f" {missing_hint}" send_msg(token, rid, msg) continue @@ -1116,7 +1140,12 @@ def sync_loop(token: str, room_id: str): f"Atlas has {total} worker nodes; " f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady." ) - if missing_hint and "no missing" not in missing_hint: + if expected_total: + msg += f" Grafana inventory expects {expected_total} workers." + missing = sorted(set(expected_workers) - set(ready_nodes)) + if missing: + msg += f" Missing: {', '.join(missing)}." + elif missing_hint and "no missing" not in missing_hint: msg += f" {missing_hint}" elif "should" in lower_body: msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state." From 7cefb603e10232f77f8e97d410a8063d94d72a32 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:01:26 -0300 Subject: [PATCH 246/416] atlasbot: improve missing node inference --- services/comms/scripts/atlasbot/bot.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index bd40a9f9..7eb6dc77 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -754,6 +754,15 @@ def expected_worker_nodes_from_metrics() -> list[str]: return [] def missing_nodes_answer(cluster_name: str) -> str: + expected_workers = expected_worker_nodes_from_metrics() + if expected_workers: + ready_nodes, not_ready_nodes = worker_nodes_status() + current_workers = set(ready_nodes + not_ready_nodes) + missing = sorted(set(expected_workers) - current_workers) + if not missing: + return f"{cluster_name}: no missing worker nodes versus Grafana inventory." + return f"{cluster_name} missing worker nodes versus Grafana inventory: {', '.join(missing)}." + expected = expected_nodes_from_kb() if not expected: return "" @@ -1173,7 +1182,7 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, summary) continue - if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body): + if re.search(r"\bnode names?\b|\bnodes?\b.*\bnamed\b|\bnaming\b", lower_body): if any(word in lower_body for word in ("cluster", "atlas", "titan")): names_summary = nodes_names_summary("Atlas") if not names_summary: @@ -1181,6 +1190,14 @@ def sync_loop(token: str, room_id: str): continue send_msg(token, rid, names_summary) continue + if re.search(r"\bwhich nodes are ready\b|\bnodes ready\b", lower_body): + ready_nodes, not_ready_nodes = worker_nodes_status() + if ready_nodes: + msg = f"Ready worker nodes ({len(ready_nodes)}): {', '.join(ready_nodes)}." + if not_ready_nodes: + msg += f" Not Ready: {', '.join(not_ready_nodes)}." + send_msg(token, rid, msg) + continue # Only do live cluster introspection in DMs; metrics can be answered when mentioned. allow_tools = is_dm From eae4521a44f74b8cdbf766aecc9b1270dbf80165 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:02:54 -0300 Subject: [PATCH 247/416] atlasbot: roll deployment --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index b7843abd..e45d9f3d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-17 + checksum/atlasbot-configmap: manual-atlasbot-18 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From f7a73dd9e3bce1b8dfe4dc1f0b986f7af042fa76 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:22:28 -0300 Subject: [PATCH 248/416] atlasbot: use live node inventory context --- services/comms/scripts/atlasbot/bot.py | 320 +++++++------------------ 1 file changed, 89 insertions(+), 231 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7eb6dc77..e070eadd 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -184,13 +184,6 @@ KB = {"catalog": {}, "runbooks": []} _HOST_INDEX: dict[str, list[dict]] = {} _NAME_INDEX: set[str] = set() _METRIC_INDEX: list[dict[str, Any]] = [] -_NODE_CLASS_INDEX: dict[str, list[str]] = {} -_NODE_CLASS_RPI4: set[str] = set() -_NODE_CLASS_RPI5: set[str] = set() -_NODE_CLASS_AMD64: set[str] = set() -_NODE_CLASS_JETSON: set[str] = set() -_NODE_CLASS_EXTERNAL: set[str] = set() -_NODE_CLASS_NON_RPI: set[str] = set() NODE_REGEX = re.compile(r'node=~"([^"]+)"') def _load_json_file(path: str) -> Any | None: @@ -202,8 +195,6 @@ def _load_json_file(path: str) -> Any | None: def load_kb(): global KB, _HOST_INDEX, _NAME_INDEX, _METRIC_INDEX - global _NODE_CLASS_INDEX, _NODE_CLASS_RPI4, _NODE_CLASS_RPI5, _NODE_CLASS_AMD64, _NODE_CLASS_JETSON - global _NODE_CLASS_EXTERNAL, _NODE_CLASS_NON_RPI if not KB_DIR: return catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} @@ -228,24 +219,6 @@ def load_kb(): _NAME_INDEX = names _METRIC_INDEX = metrics if isinstance(metrics, list) else [] - node_classes = _parse_node_classes(runbooks) - _NODE_CLASS_INDEX = node_classes - _NODE_CLASS_RPI4 = set(node_classes.get("rpi4", [])) - _NODE_CLASS_RPI5 = set(node_classes.get("rpi5", [])) - _NODE_CLASS_AMD64 = set(node_classes.get("amd64", [])) - _NODE_CLASS_JETSON = set(node_classes.get("jetson", [])) - _NODE_CLASS_EXTERNAL = set(node_classes.get("external", [])) - _NODE_CLASS_NON_RPI = set( - sorted( - ( - set().union(*node_classes.values()) - - _NODE_CLASS_RPI4 - - _NODE_CLASS_RPI5 - - _NODE_CLASS_EXTERNAL - ) - ) - ) - def kb_retrieve(query: str, *, limit: int = 3) -> str: q = (query or "").strip() if not q or not KB.get("runbooks"): @@ -309,81 +282,92 @@ def _extract_titan_nodes(text: str) -> list[str]: names.add(f"titan-{right.lower()}") return sorted(names) -def _parse_node_classes(runbooks: list[dict[str, Any]]) -> dict[str, list[str]]: - classes: dict[str, list[str]] = {} - for doc in runbooks: - if not isinstance(doc, dict): - continue - body = str(doc.get("body") or "") - for line in body.splitlines(): - stripped = line.strip() - if "titan-" not in stripped.lower(): - continue - label = "" - nodes: list[str] = [] - if stripped.startswith("-") and ":" in stripped: - label, rest = stripped.lstrip("-").split(":", 1) - nodes = _extract_titan_nodes(rest) - label = label.strip().lower() - else: - nodes = _extract_titan_nodes(stripped) - if not nodes: - continue - if "jetson" in stripped.lower(): - classes.setdefault("jetson", nodes) - if "amd64" in stripped.lower() or "x86" in stripped.lower(): - classes.setdefault("amd64", nodes) - if "rpi4" in stripped.lower(): - classes.setdefault("rpi4", nodes) - if "rpi5" in stripped.lower(): - classes.setdefault("rpi5", nodes) - if "external" in stripped.lower() or "non-cluster" in stripped.lower(): - classes.setdefault("external", nodes) - if label: - classes.setdefault(label, nodes) - return {k: sorted(set(v)) for k, v in classes.items()} +def _node_roles(labels: dict[str, Any]) -> list[str]: + roles: list[str] = [] + for key in labels.keys(): + if key.startswith("node-role.kubernetes.io/"): + role = key.split("/", 1)[-1] + if role: + roles.append(role) + return sorted(set(roles)) -def node_inventory_answer(cluster_name: str, query: str) -> str: - q = (query or "").lower() - if "jetson" in q and _NODE_CLASS_JETSON: - names = sorted(_NODE_CLASS_JETSON) - return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}." - if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: - names = sorted(_NODE_CLASS_NON_RPI) - if names: - return f"{cluster_name} non‑Raspberry Pi nodes: {', '.join(names)}." - if "raspberry" in q or "rpi" in q: - if "rpi4" in q and _NODE_CLASS_RPI4: - names = sorted(_NODE_CLASS_RPI4) - return f"{cluster_name} rpi4 nodes: {', '.join(names)}." - if "rpi5" in q and _NODE_CLASS_RPI5: - names = sorted(_NODE_CLASS_RPI5) - return f"{cluster_name} rpi5 nodes: {', '.join(names)}." - names = sorted(_NODE_CLASS_RPI4 | _NODE_CLASS_RPI5) - if names: - return f"{cluster_name} Raspberry Pi nodes: {', '.join(names)}." - if ("amd64" in q or "x86" in q) and _NODE_CLASS_AMD64: - names = sorted(_NODE_CLASS_AMD64) - return f"{cluster_name} amd64 nodes: {', '.join(names)}." - return "" +def _hardware_class(labels: dict[str, Any]) -> str: + if str(labels.get("jetson") or "").lower() == "true": + return "jetson" + hardware = (labels.get("hardware") or "").strip().lower() + if hardware in ("rpi4", "rpi5"): + return hardware + arch = labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "" + if arch == "amd64": + return "amd64" + if arch == "arm64": + return "arm64-unknown" + return "unknown" + +def node_inventory_live() -> list[dict[str, Any]]: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return [] + items = data.get("items") or [] + inventory: list[dict[str, Any]] = [] + for node in items if isinstance(items, list) else []: + meta = node.get("metadata") or {} + labels = meta.get("labels") or {} + name = meta.get("name") or "" + if not name: + continue + inventory.append( + { + "name": name, + "arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", + "hardware": _hardware_class(labels), + "roles": _node_roles(labels), + "ready": _node_ready_status(node), + } + ) + return sorted(inventory, key=lambda item: item["name"]) + +def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: + grouped: dict[str, list[str]] = collections.defaultdict(list) + for node in inventory: + grouped[node.get("hardware") or "unknown"].append(node["name"]) + return {k: sorted(v) for k, v in grouped.items()} def node_inventory_context(query: str) -> str: q = (query or "").lower() - if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "x86", "cluster")): + if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): return "" - lines: list[str] = ["Node inventory (KB):"] - if _NODE_CLASS_RPI5: - lines.append(f"- rpi5: {', '.join(sorted(_NODE_CLASS_RPI5))}") - if _NODE_CLASS_RPI4: - lines.append(f"- rpi4: {', '.join(sorted(_NODE_CLASS_RPI4))}") - if _NODE_CLASS_JETSON: - lines.append(f"- jetson: {', '.join(sorted(_NODE_CLASS_JETSON))}") - if _NODE_CLASS_AMD64: - lines.append(f"- amd64: {', '.join(sorted(_NODE_CLASS_AMD64))}") - if _NODE_CLASS_EXTERNAL: - lines.append(f"- external: {', '.join(sorted(_NODE_CLASS_EXTERNAL))}") - if len(lines) == 1: + inventory = node_inventory_live() + if not inventory: return "" + groups = _group_nodes(inventory) + total = len(inventory) + ready = sum(1 for node in inventory if node.get("ready") is True) + not_ready = sum(1 for node in inventory if node.get("ready") is False) + lines: list[str] = [ + "Node inventory (live):", + f"- total: {total}, ready: {ready}, not ready: {not_ready}", + ] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + if key in groups: + lines.append(f"- {key}: {', '.join(groups[key])}") + non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) + if non_rpi: + lines.append(f"- non_raspberry_pi (derived): {', '.join(non_rpi)}") + unknowns = groups.get("arm64-unknown", []) + groups.get("unknown", []) + if unknowns: + lines.append("- note: nodes labeled arm64-unknown/unknown may still be Raspberry Pi unless tagged.") + expected_workers = expected_worker_nodes_from_metrics() + if expected_workers: + ready_workers, not_ready_workers = worker_nodes_status() + missing = sorted(set(expected_workers) - set(ready_workers + not_ready_workers)) + lines.append(f"- expected_workers (grafana): {', '.join(expected_workers)}") + lines.append(f"- workers_ready: {', '.join(ready_workers)}") + if not_ready_workers: + lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") + if missing: + lines.append(f"- workers_missing (derived): {', '.join(missing)}") return "\n".join(lines) def _metric_tokens(entry: dict[str, Any]) -> str: @@ -730,12 +714,6 @@ def worker_nodes_status() -> tuple[list[str], list[str]]: not_ready_nodes.append(name) return (sorted(ready_nodes), sorted(not_ready_nodes)) -def expected_nodes_from_kb() -> set[str]: - if not _NODE_CLASS_INDEX: - return set() - nodes = set().union(*_NODE_CLASS_INDEX.values()) - return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL} - def expected_worker_nodes_from_metrics() -> list[str]: for entry in _METRIC_INDEX: panel = (entry.get("panel_title") or "").lower() @@ -753,42 +731,13 @@ def expected_worker_nodes_from_metrics() -> list[str]: return sorted(nodes) return [] -def missing_nodes_answer(cluster_name: str) -> str: - expected_workers = expected_worker_nodes_from_metrics() - if expected_workers: - ready_nodes, not_ready_nodes = worker_nodes_status() - current_workers = set(ready_nodes + not_ready_nodes) - missing = sorted(set(expected_workers) - current_workers) - if not missing: - return f"{cluster_name}: no missing worker nodes versus Grafana inventory." - return f"{cluster_name} missing worker nodes versus Grafana inventory: {', '.join(missing)}." - - expected = expected_nodes_from_kb() - if not expected: +def _context_fallback(context: str) -> str: + if not context: return "" - current = set() - try: - data = k8s_get("/api/v1/nodes?limit=500") - items = data.get("items") or [] - for node in items if isinstance(items, list) else []: - name = (node.get("metadata") or {}).get("name") or "" - if name: - current.add(name) - except Exception: - return "" - missing = sorted(expected - current) - if not missing: - return f"{cluster_name}: no missing nodes versus KB inventory." - return f"{cluster_name} missing nodes versus KB inventory: {', '.join(missing)}." - -def _should_short_circuit(prompt: str, fallback: str) -> bool: - if not fallback: - return False - lower = (prompt or "").lower() - for word in ("why", "explain", "architecture", "breakdown", "root cause", "plan"): - if word in lower: - return False - return True + trimmed = context.strip() + if len(trimmed) > MAX_TOOL_CHARS: + trimmed = trimmed[: MAX_TOOL_CHARS - 3].rstrip() + "..." + return "I couldn’t reach the model backend. Here is the data I found:\n" + trimmed def vm_top_restarts(hours: int = 1) -> str: q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" @@ -1112,92 +1061,6 @@ def sync_loop(token: str, room_id: str): continue lower_body = body.lower() - if re.search(r"\bhow many nodes\b|\bnode count\b|\bnumber of nodes\b", lower_body): - if any(word in lower_body for word in ("cluster", "atlas", "titan")): - summary = nodes_summary("Atlas") - if not summary: - send_msg(token, rid, "I couldn’t reach the cluster API to count nodes. Try again in a moment.") - continue - send_msg(token, rid, summary) - continue - if "worker" in lower_body and "node" in lower_body: - ready_nodes, not_ready_nodes = worker_nodes_status() - total = len(ready_nodes) + len(not_ready_nodes) - if total: - missing_hint = missing_nodes_answer("Atlas") - expected_workers = expected_worker_nodes_from_metrics() - expected_total = len(expected_workers) if expected_workers else 0 - if any(word in lower_body for word in ("ready", "not ready", "unready")): - if not_ready_nodes: - send_msg( - token, - rid, - f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.", - ) - else: - msg = f"All {len(ready_nodes)} worker nodes are Ready." - if expected_total and len(ready_nodes) != expected_total: - missing = sorted(set(expected_workers) - set(ready_nodes)) - if missing: - msg += f" Missing: {', '.join(missing)}." - elif missing_hint and "no missing" not in missing_hint: - msg += f" {missing_hint}" - send_msg(token, rid, msg) - continue - if any(word in lower_body for word in ("how many", "should")): - msg = ( - f"Atlas has {total} worker nodes; " - f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady." - ) - if expected_total: - msg += f" Grafana inventory expects {expected_total} workers." - missing = sorted(set(expected_workers) - set(ready_nodes)) - if missing: - msg += f" Missing: {', '.join(missing)}." - elif missing_hint and "no missing" not in missing_hint: - msg += f" {missing_hint}" - elif "should" in lower_body: - msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state." - send_msg(token, rid, msg) - continue - if "missing" in lower_body and "node" in lower_body: - missing = missing_nodes_answer("Atlas") - if missing: - send_msg(token, rid, missing) - continue - inventory_answer = node_inventory_answer("Atlas", lower_body) - if inventory_answer: - send_msg(token, rid, inventory_answer) - continue - if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")): - if any(word in lower_body for word in ("cluster", "atlas", "titan")): - arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64" - summary = nodes_arch_summary("Atlas", arch) - if not summary: - send_msg( - token, - rid, - "I couldn’t reach the cluster API to count nodes by architecture. Try again in a moment.", - ) - continue - send_msg(token, rid, summary) - continue - if re.search(r"\bnode names?\b|\bnodes?\b.*\bnamed\b|\bnaming\b", lower_body): - if any(word in lower_body for word in ("cluster", "atlas", "titan")): - names_summary = nodes_names_summary("Atlas") - if not names_summary: - send_msg(token, rid, "I couldn’t reach the cluster API to list node names. Try again in a moment.") - continue - send_msg(token, rid, names_summary) - continue - if re.search(r"\bwhich nodes are ready\b|\bnodes ready\b", lower_body): - ready_nodes, not_ready_nodes = worker_nodes_status() - if ready_nodes: - msg = f"Ready worker nodes ({len(ready_nodes)}): {', '.join(ready_nodes)}." - if not_ready_nodes: - msg += f" Not Ready: {', '.join(not_ready_nodes)}." - send_msg(token, rid, msg) - continue # Only do live cluster introspection in DMs; metrics can be answered when mentioned. allow_tools = is_dm @@ -1230,14 +1093,9 @@ def sync_loop(token: str, room_id: str): if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = "" - if "node" in lower_body or "cluster" in lower_body: - fallback = node_inventory_answer("Atlas", lower_body) - if metrics_fallback and not fallback: - fallback = metrics_fallback - if _should_short_circuit(body, fallback): - send_msg(token, rid, fallback) - continue + fallback = metrics_fallback or "" + if not fallback and context: + fallback = _context_fallback(context) reply = ollama_reply_with_thinking( token, rid, From e8ff0a5c223d7685edea42e0bcb7bc718d6e9002 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:24:03 -0300 Subject: [PATCH 249/416] atlasbot: reload for live inventory --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index e45d9f3d..4d5b31cc 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-18 + checksum/atlasbot-configmap: manual-atlasbot-19 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From fa08bd34f36fdde4727ec22b82728d20285d3ba1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:29:26 -0300 Subject: [PATCH 250/416] atlasbot: answer from live inventory --- services/comms/scripts/atlasbot/bot.py | 123 +++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 7 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e070eadd..6fc654bd 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -334,11 +334,12 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: grouped[node.get("hardware") or "unknown"].append(node["name"]) return {k: sorted(v) for k, v in grouped.items()} -def node_inventory_context(query: str) -> str: +def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str: q = (query or "").lower() if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): return "" - inventory = node_inventory_live() + if inventory is None: + inventory = node_inventory_live() if not inventory: return "" groups = _group_nodes(inventory) @@ -370,6 +371,101 @@ def node_inventory_context(query: str) -> str: lines.append(f"- workers_missing (derived): {', '.join(missing)}") return "\n".join(lines) +def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: + q = (prompt or "").lower() + if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")): + return node_inventory_live() + return [] + +def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: + names = [node["name"] for node in inventory] + ready = [node["name"] for node in inventory if node.get("ready") is True] + not_ready = [node["name"] for node in inventory if node.get("ready") is False] + groups = _group_nodes(inventory) + return { + "names": sorted(names), + "ready": sorted(ready), + "not_ready": sorted(not_ready), + "groups": groups, + } + +def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: + q = (prompt or "").lower() + if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")): + return metrics_summary + + if not inventory: + return "" + + sets = _inventory_sets(inventory) + names = sets["names"] + ready = sets["ready"] + not_ready = sets["not_ready"] + groups = sets["groups"] + total = len(names) + + for node in _extract_titan_nodes(q): + if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q): + if node in names: + return f"Yes. {node} is in the Atlas cluster." + return f"No. {node} is not in the Atlas cluster." + + if any(word in q for word in ("how many", "count", "number")) and "node" in q and "worker" not in q: + return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready." + + if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q: + return "Atlas node names: " + ", ".join(names) + "." + + if "ready" in q and "node" in q and "worker" in q: + if "not ready" in q or "unready" in q or "down" in q: + return "Worker nodes not ready: " + (", ".join(not_ready) if not_ready else "none") + "." + return "Ready worker nodes ({}): {}.".format(len(ready), ", ".join(ready)) + + if "worker" in q and any(word in q for word in ("missing", "expected", "should")): + expected_workers = expected_worker_nodes_from_metrics() + missing = sorted(set(expected_workers) - set(ready + not_ready)) if expected_workers else [] + if "missing" in q and missing: + return "Missing worker nodes: " + ", ".join(missing) + "." + if expected_workers: + msg = f"Grafana inventory expects {len(expected_workers)} workers." + if missing: + msg += f" Missing: {', '.join(missing)}." + return msg + return "No expected worker inventory found; using live cluster state." + + if "worker" in q and "node" in q and "ready" not in q and "missing" not in q: + return f"Worker nodes: {len(ready)} ready, {len(not_ready)} not ready." + + if "jetson" in q: + jets = groups.get("jetson", []) + return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found." + + if "amd64" in q or "x86" in q: + amd = groups.get("amd64", []) + return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found." + + if "rpi4" in q: + rpi4 = groups.get("rpi4", []) + return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found." + + if "rpi5" in q: + rpi5 = groups.get("rpi5", []) + return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found." + + if "raspberry" in q or "rpi" in q: + rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))) + return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found." + + if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: + non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) + return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found." + + if "arm64-unknown" in q or "unknown" in q: + unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))) + return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels." + + return "" + def _metric_tokens(entry: dict[str, Any]) -> str: parts: list[str] = [] for key in ("panel_title", "dashboard", "description"): @@ -900,7 +996,13 @@ history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] ( def key_for(room_id: str, sender: str, is_dm: bool): return (room_id, None) if is_dm else (room_id, sender) -def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, str]]) -> str: +def build_context( + prompt: str, + *, + allow_tools: bool, + targets: list[tuple[str, str]], + inventory: list[dict[str, Any]] | None = None, +) -> str: parts: list[str] = [] kb = kb_retrieve(prompt) @@ -911,9 +1013,9 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st if endpoints: parts.append(endpoints) - inventory = node_inventory_context(prompt) - if inventory: - parts.append(inventory) + node_ctx = node_inventory_context(prompt, inventory) + if node_ctx: + parts.append(node_ctx) if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. @@ -1083,7 +1185,8 @@ def sync_loop(token: str, room_id: str): if isinstance(w, dict) and w.get("name"): targets.append((ns, str(w["name"]))) - context = build_context(body, allow_tools=allow_tools, targets=targets) + inventory = node_inventory_for_prompt(body) + context = build_context(body, allow_tools=allow_tools, targets=targets, inventory=inventory) if allow_tools and promql: res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" @@ -1096,6 +1199,12 @@ def sync_loop(token: str, room_id: str): fallback = metrics_fallback or "" if not fallback and context: fallback = _context_fallback(context) + + structured = structured_answer(body, inventory=inventory, metrics_summary=metrics_fallback or "") + if structured: + send_msg(token, rid, structured) + continue + reply = ollama_reply_with_thinking( token, rid, From f09035e900a79f11c20742aea4bc25e0c8cd8aff Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:31:07 -0300 Subject: [PATCH 251/416] atlasbot: reload inventory answers --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4d5b31cc..57705ecc 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-19 + checksum/atlasbot-configmap: manual-atlasbot-20 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From a7ff5093da485007184eaadce6afc165486a0c0c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:34:19 -0300 Subject: [PATCH 252/416] atlasbot: generalize inventory answers --- services/comms/scripts/atlasbot/bot.py | 80 ++++++++++++++++---------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6fc654bd..d06645a5 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -382,11 +382,18 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: ready = [node["name"] for node in inventory if node.get("ready") is True] not_ready = [node["name"] for node in inventory if node.get("ready") is False] groups = _group_nodes(inventory) + workers = [node for node in inventory if "worker" in (node.get("roles") or [])] + worker_names = [node["name"] for node in workers] + worker_ready = [node["name"] for node in workers if node.get("ready") is True] + worker_not_ready = [node["name"] for node in workers if node.get("ready") is False] return { "names": sorted(names), "ready": sorted(ready), "not_ready": sorted(not_ready), "groups": groups, + "worker_names": sorted(worker_names), + "worker_ready": sorted(worker_ready), + "worker_not_ready": sorted(worker_not_ready), } def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: @@ -402,6 +409,9 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s ready = sets["ready"] not_ready = sets["not_ready"] groups = sets["groups"] + worker_names = sets["worker_names"] + worker_ready = sets["worker_ready"] + worker_not_ready = sets["worker_not_ready"] total = len(names) for node in _extract_titan_nodes(q): @@ -410,31 +420,12 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s return f"Yes. {node} is in the Atlas cluster." return f"No. {node} is not in the Atlas cluster." - if any(word in q for word in ("how many", "count", "number")) and "node" in q and "worker" not in q: - return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready." - - if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q: - return "Atlas node names: " + ", ".join(names) + "." - - if "ready" in q and "node" in q and "worker" in q: - if "not ready" in q or "unready" in q or "down" in q: - return "Worker nodes not ready: " + (", ".join(not_ready) if not_ready else "none") + "." - return "Ready worker nodes ({}): {}.".format(len(ready), ", ".join(ready)) - - if "worker" in q and any(word in q for word in ("missing", "expected", "should")): - expected_workers = expected_worker_nodes_from_metrics() - missing = sorted(set(expected_workers) - set(ready + not_ready)) if expected_workers else [] - if "missing" in q and missing: - return "Missing worker nodes: " + ", ".join(missing) + "." - if expected_workers: - msg = f"Grafana inventory expects {len(expected_workers)} workers." - if missing: - msg += f" Missing: {', '.join(missing)}." - return msg - return "No expected worker inventory found; using live cluster state." - - if "worker" in q and "node" in q and "ready" not in q and "missing" not in q: - return f"Worker nodes: {len(ready)} ready, {len(not_ready)} not ready." + if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: + non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) + if "besides" in q: + amd = groups.get("amd64", []) + return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson." + return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found." if "jetson" in q: jets = groups.get("jetson", []) @@ -446,24 +437,53 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s if "rpi4" in q: rpi4 = groups.get("rpi4", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(rpi4)} rpi4 nodes." return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found." if "rpi5" in q: rpi5 = groups.get("rpi5", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(rpi5)} rpi5 nodes." return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found." if "raspberry" in q or "rpi" in q: rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(rpi)} Raspberry Pi nodes." return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found." - if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: - non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) - return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found." - - if "arm64-unknown" in q or "unknown" in q: + if "arm64-unknown" in q or "unknown" in q or "no hardware" in q: unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))) return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels." + if "worker" in q and "node" in q: + if any(word in q for word in ("missing", "expected", "should")): + expected_workers = expected_worker_nodes_from_metrics() + missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else [] + if "missing" in q and missing: + return "Missing worker nodes: " + ", ".join(missing) + "." + if expected_workers: + msg = f"Grafana inventory expects {len(expected_workers)} workers." + if missing: + msg += f" Missing: {', '.join(missing)}." + return msg + return "No expected worker inventory found; using live cluster state." + if "not ready" in q or "unready" in q or "down" in q: + return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "." + if any(word in q for word in ("how many", "count", "number")): + return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready." + return "Ready worker nodes ({}): {}.".format(len(worker_ready), ", ".join(worker_ready)) + + if any(word in q for word in ("how many", "count", "number")) and "node" in q: + return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready." + + if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q: + return "Atlas node names: " + ", ".join(names) + "." + + if "ready" in q and "node" in q: + return f"Ready nodes ({len(ready)}): {', '.join(ready)}." + return "" def _metric_tokens(entry: dict[str, Any]) -> str: From d372bc10fbda956719aed002bae046013c2e1b46 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:34:42 -0300 Subject: [PATCH 253/416] atlasbot: reload structured answers --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 57705ecc..c723d22e 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-20 + checksum/atlasbot-configmap: manual-atlasbot-21 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 74c37832121201078f6d44cdf9d2c3c5e63fcf57 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:53:11 -0300 Subject: [PATCH 254/416] atlasbot: improve node inventory reasoning --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 154 +++++++++++++++++++----- 2 files changed, 122 insertions(+), 34 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c723d22e..7cc66b32 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-21 + checksum/atlasbot-configmap: manual-atlasbot-22 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index d06645a5..6993db25 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -89,9 +89,17 @@ METRIC_HINT_WORDS = { "latency", } -CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL) -TITAN_NODE_RE = re.compile(r"\\btitan-[0-9a-z]{2}\\b", re.IGNORECASE) -TITAN_RANGE_RE = re.compile(r"\\btitan-([0-9a-z]{2})/([0-9a-z]{2})\\b", re.IGNORECASE) +CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) +TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) +TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) +_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" + +def normalize_query(text: str) -> str: + cleaned = (text or "").lower() + for ch in _DASH_CHARS: + cleaned = cleaned.replace(ch, "-") + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned def _tokens(text: str) -> list[str]: toks = [t.lower() for t in TOKEN_RE.findall(text or "")] @@ -267,14 +275,15 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: return "\n".join(parts).strip() def _extract_titan_nodes(text: str) -> list[str]: - names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n} - for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE): + cleaned = normalize_query(text) + names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n} + for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", cleaned, re.IGNORECASE): tail = match.group(1) for part in re.split(r"[/,]", tail): part = part.strip() if part: names.add(f"titan-{part.lower()}") - for match in TITAN_RANGE_RE.finditer(text or ""): + for match in TITAN_RANGE_RE.finditer(cleaned): left, right = match.groups() if left: names.add(f"titan-{left.lower()}") @@ -323,6 +332,7 @@ def node_inventory_live() -> list[dict[str, Any]]: "arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", "hardware": _hardware_class(labels), "roles": _node_roles(labels), + "is_worker": _node_is_worker(node), "ready": _node_ready_status(node), } ) @@ -335,7 +345,7 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: return {k: sorted(v) for k, v in grouped.items()} def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str: - q = (query or "").lower() + q = normalize_query(query) if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): return "" if inventory is None: @@ -372,7 +382,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = return "\n".join(lines) def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: - q = (prompt or "").lower() + q = normalize_query(prompt) if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")): return node_inventory_live() return [] @@ -382,10 +392,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: ready = [node["name"] for node in inventory if node.get("ready") is True] not_ready = [node["name"] for node in inventory if node.get("ready") is False] groups = _group_nodes(inventory) - workers = [node for node in inventory if "worker" in (node.get("roles") or [])] + workers = [node for node in inventory if node.get("is_worker") is True] worker_names = [node["name"] for node in workers] worker_ready = [node["name"] for node in workers if node.get("ready") is True] worker_not_ready = [node["name"] for node in workers if node.get("ready") is False] + expected_workers = expected_worker_nodes_from_metrics() + expected_ready = [n for n in expected_workers if n in ready] if expected_workers else [] + expected_not_ready = [n for n in expected_workers if n in not_ready] if expected_workers else [] + expected_missing = [n for n in expected_workers if n not in names] if expected_workers else [] return { "names": sorted(names), "ready": sorted(ready), @@ -394,10 +408,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: "worker_names": sorted(worker_names), "worker_ready": sorted(worker_ready), "worker_not_ready": sorted(worker_not_ready), + "expected_workers": expected_workers, + "expected_ready": sorted(expected_ready), + "expected_not_ready": sorted(expected_not_ready), + "expected_missing": sorted(expected_missing), } def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: - q = (prompt or "").lower() + q = normalize_query(prompt) if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")): return metrics_summary @@ -412,29 +430,75 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s worker_names = sets["worker_names"] worker_ready = sets["worker_ready"] worker_not_ready = sets["worker_not_ready"] + expected_workers = sets["expected_workers"] + expected_ready = sets["expected_ready"] + expected_not_ready = sets["expected_not_ready"] + expected_missing = sets["expected_missing"] total = len(names) + nodes_in_query = _extract_titan_nodes(q) + rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])) + non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", [])) + unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])) - for node in _extract_titan_nodes(q): - if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q): + if nodes_in_query and ("raspberry" in q or "rpi" in q): + parts: list[str] = [] + for node in nodes_in_query: + if node in rpi_nodes: + parts.append(f"{node} is a Raspberry Pi node.") + elif node in non_rpi: + parts.append(f"{node} is not a Raspberry Pi node.") + elif node in names: + parts.append(f"{node} is in Atlas but hardware is unknown.") + else: + parts.append(f"{node} is not in the Atlas cluster.") + return " ".join(parts) + + if nodes_in_query and "jetson" in q: + jets = set(groups.get("jetson", [])) + parts = [] + for node in nodes_in_query: + if node in jets: + parts.append(f"{node} is a Jetson node.") + elif node in names: + parts.append(f"{node} is not a Jetson node.") + else: + parts.append(f"{node} is not in the Atlas cluster.") + return " ".join(parts) + + if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q): + parts: list[str] = [] + for node in nodes_in_query: if node in names: - return f"Yes. {node} is in the Atlas cluster." - return f"No. {node} is not in the Atlas cluster." + parts.append(f"Yes. {node} is in the Atlas cluster.") + else: + parts.append(f"No. {node} is not in the Atlas cluster.") + return " ".join(parts) - if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: - non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) - if "besides" in q: - amd = groups.get("amd64", []) + if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")): + non_rpi_sorted = sorted(non_rpi) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes." + if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")): + amd = sorted(groups.get("amd64", [])) return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson." - return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found." + return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found." if "jetson" in q: jets = groups.get("jetson", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(jets)} Jetson nodes." return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found." if "amd64" in q or "x86" in q: amd = groups.get("amd64", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(amd)} amd64 nodes." return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found." + if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")): + count = sum(1 for node in inventory if node.get("arch") == "arm64") + return f"Atlas has {count} arm64 nodes." + if "rpi4" in q: rpi4 = groups.get("rpi4", []) if any(word in q for word in ("how many", "count", "number")): @@ -448,28 +512,52 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found." if "raspberry" in q or "rpi" in q: - rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))) + rpi = sorted(rpi_nodes) if any(word in q for word in ("how many", "count", "number")): return f"Atlas has {len(rpi)} Raspberry Pi nodes." return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found." if "arm64-unknown" in q or "unknown" in q or "no hardware" in q: - unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))) + unknown = sorted(unknown_hw) return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels." - if "worker" in q and "node" in q: - if any(word in q for word in ("missing", "expected", "should")): - expected_workers = expected_worker_nodes_from_metrics() - missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else [] - if "missing" in q and missing: - return "Missing worker nodes: " + ", ".join(missing) + "." - if expected_workers: - msg = f"Grafana inventory expects {len(expected_workers)} workers." - if missing: - msg += f" Missing: {', '.join(missing)}." + if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q): + return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "." + + if "worker" in q and ("node" in q or "nodes" in q or "workers" in q): + not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q) + if expected_workers: + if "missing" in q: + return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "." + if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q): + return ( + f"Expected workers: {len(expected_ready)} ready, " + f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})." + ) + if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q): + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if expected_missing: + msg += f" Missing: {', '.join(expected_missing)}." return msg - return "No expected worker inventory found; using live cluster state." - if "not ready" in q or "unready" in q or "down" in q: + if not_ready_query: + if expected_not_ready or expected_missing: + detail = [] + if expected_not_ready: + detail.append(f"Not ready: {', '.join(expected_not_ready)}") + if expected_missing: + detail.append(f"Missing: {', '.join(expected_missing)}") + return "Worker nodes needing attention. " + " ".join(detail) + "." + return "All expected worker nodes are Ready." + if any(word in q for word in ("expected", "expect", "should")): + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if expected_missing: + msg += f" Missing: {', '.join(expected_missing)}." + return msg + if any(word in q for word in ("how many", "count", "number")): + return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})." + if "ready" in q: + return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}." + if not_ready_query: return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "." if any(word in q for word in ("how many", "count", "number")): return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready." From f7d4425740fec1d957651e5a51b8a1d47364f0e0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 20:54:33 -0300 Subject: [PATCH 255/416] ariadne: reduce comms noise, fix gpu labels --- scripts/dashboards_render_atlas.py | 4 ++-- services/comms/mas-local-users-ensure-job.yaml | 2 +- services/comms/synapse-seeder-admin-ensure-job.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 2 +- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/dcgm-exporter.yaml | 2 ++ services/monitoring/grafana-dashboard-overview.yaml | 2 +- 7 files changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 11479d9d..5aa77dc1 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -364,9 +364,9 @@ ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' ARIADNE_TEST_SUCCESS_RATE = ( "100 * " - 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[1h])) ' + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) ' "/ clamp_min(" - 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[1h])), 1)' + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)' ) ARIADNE_TEST_FAILURES_24H = ( 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml index 5802009a..c8cf5f04 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/mas-local-users-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: mas-local-users-ensure-15 + name: mas-local-users-ensure-16 namespace: comms spec: backoffLimit: 1 diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/synapse-seeder-admin-ensure-job.yaml index 99056586..ce8ccd35 100644 --- a/services/comms/synapse-seeder-admin-ensure-job.yaml +++ b/services/comms/synapse-seeder-admin-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-seeder-admin-ensure-7 + name: synapse-seeder-admin-ensure-8 namespace: comms spec: backoffLimit: 2 diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 0356e060..33b8a12a 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -306,7 +306,7 @@ spec: - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME value: "*/5 * * * *" - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE - value: "*/30 * * * *" + value: "0 0 1 * *" - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM value: "0 0 1 1 *" - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 2d7f3e51..486cd611 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1690,7 +1690,7 @@ }, "targets": [ { - "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "refId": "A" } ], diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index 8760c9f0..3e8d1a60 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -50,6 +50,8 @@ spec: env: - name: DCGM_EXPORTER_KUBERNETES value: "true" + - name: KUBERNETES_VIRTUAL_GPUS + value: "true" securityContext: privileged: true resources: diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 53361345..afc1e1fb 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1699,7 +1699,7 @@ data: }, "targets": [ { - "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)", + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "refId": "A" } ], From a68594972e50e36b0f3236be9f1f2d1272efa3b7 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Mon, 26 Jan 2026 23:54:53 +0000 Subject: [PATCH 256/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 3933caf6..2678a46b 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-49 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-50 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 6a5c9fb0e60c9d196d0e62397a8df4bfa5c7ae43 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 20:57:47 -0300 Subject: [PATCH 257/416] monitoring: map dcgm to shared gpu resources --- services/monitoring/dcgm-exporter.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index 3e8d1a60..ff5aed5a 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -52,6 +52,8 @@ spec: value: "true" - name: KUBERNETES_VIRTUAL_GPUS value: "true" + - name: NVIDIA_RESOURCE_NAMES + value: nvidia.com/gpu.shared securityContext: privileged: true resources: From 332c6bb6ba5e9972ff919e871424542598fd69ef Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:13:04 -0300 Subject: [PATCH 258/416] atlasbot: answer hottest node queries via metrics --- services/comms/scripts/atlasbot/bot.py | 94 ++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6993db25..233b25e9 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -18,6 +18,8 @@ OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") API_KEY = os.environ.get("CHAT_API_KEY", "") OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) +ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090")) +ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "") KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") @@ -93,6 +95,12 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" +HOTTEST_QUERIES = { + "cpu": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "ram": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "net": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "io": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", +} def normalize_query(text: str) -> str: cleaned = (text or "").lower() @@ -291,6 +299,77 @@ def _extract_titan_nodes(text: str) -> list[str]: names.add(f"titan-{right.lower()}") return sorted(names) +def _humanize_rate(value: str, *, unit: str) -> str: + try: + val = float(value) + except (TypeError, ValueError): + return value + if unit == "%": + return f"{val:.1f}%" + if val >= 1024 * 1024: + return f"{val / (1024 * 1024):.2f} MB/s" + if val >= 1024: + return f"{val / 1024:.2f} KB/s" + return f"{val:.2f} B/s" + +def _hottest_query(metric: str, node_regex: str | None) -> str: + expr = HOTTEST_QUERIES[metric] + if node_regex: + needle = 'node_uname_info{nodename!=""}' + replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' + return expr.replace(needle, replacement) + return expr + +def _vm_hottest(metric: str, node_regex: str | None) -> tuple[str, str] | None: + expr = _hottest_query(metric, node_regex) + res = vm_query(expr) + series = _vm_value_series(res) + if not series: + return None + first = series[0] + labels = first.get("metric") or {} + value = first.get("value") or [] + val = value[1] if isinstance(value, list) and len(value) > 1 else "" + node = labels.get("node") or labels.get("__name__") or "" + if not node: + return None + return (str(node), str(val)) + +def _hottest_answer(q: str, *, nodes: list[str] | None) -> str: + metric = None + assumed_cpu = False + if "cpu" in q: + metric = "cpu" + elif "ram" in q or "memory" in q: + metric = "ram" + elif "net" in q or "network" in q: + metric = "net" + elif "io" in q or "disk" in q or "storage" in q: + metric = "io" + if metric is None: + metric = "cpu" + assumed_cpu = True + if nodes is not None and not nodes: + return "No nodes match the requested hardware class." + + node_regex = "|".join(nodes) if nodes else None + metrics = [metric] + lines: list[str] = [] + for m in metrics: + picked = _vm_hottest(m, node_regex) + if not picked: + continue + node, val = picked + unit = "%" if m in ("cpu", "ram") else "B/s" + val_str = _humanize_rate(val, unit=unit) + label = {"cpu": "CPU", "ram": "RAM", "net": "NET", "io": "I/O"}[m] + lines.append(f"{label}: {node} ({val_str})") + if not lines: + return "" + label = metric.upper() + suffix = " (defaulting to CPU)" if assumed_cpu else "" + return f"Hottest node by {label}: {lines[0].split(': ', 1)[1]}.{suffix}" + def _node_roles(labels: dict[str, Any]) -> list[str]: roles: list[str] = [] for key in labels.keys(): @@ -440,6 +519,21 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", [])) unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])) + if "hottest" in q or "hot" in q: + filter_nodes: list[str] | None = None + if "amd64" in q or "x86" in q: + filter_nodes = sorted(groups.get("amd64", [])) + elif "jetson" in q: + filter_nodes = sorted(groups.get("jetson", [])) + elif "raspberry" in q or "rpi" in q: + filter_nodes = sorted(rpi_nodes) + elif "arm64" in q: + filter_nodes = sorted([n for n in names if n not in groups.get("amd64", [])]) + hottest = _hottest_answer(q, nodes=filter_nodes) + if hottest: + return hottest + return "Unable to determine hottest nodes right now (metrics unavailable)." + if nodes_in_query and ("raspberry" in q or "rpi" in q): parts: list[str] = [] for node in nodes_in_query: From b5e5507ff04f47d58d80a775abfb2c79cd6b9cb6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:13:53 -0300 Subject: [PATCH 259/416] comms: restart atlasbot for hottest node fix --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7cc66b32..d5ad62eb 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-22 + checksum/atlasbot-configmap: manual-atlasbot-23 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 6062e266aa496d9f27451d0d702b8861744f5fcd Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:26:13 -0300 Subject: [PATCH 260/416] vault: allow ariadne to use vault-admin role --- services/vault/scripts/vault_k8s_auth_configure.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index a956e0e5..21132c79 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -193,8 +193,8 @@ path "kv/data/atlas/shared/*" { write_raw_policy "dev-kv" "${dev_kv_policy}" log "writing role vault-admin" vault_cmd write "auth/kubernetes/role/vault-admin" \ - bound_service_account_names="vault-admin" \ - bound_service_account_namespaces="vault" \ + bound_service_account_names="vault-admin,ariadne" \ + bound_service_account_namespaces="vault,maintenance" \ policies="vault-admin" \ ttl="${role_ttl}" From 995050f544383412a5cb6544c7039d0119c2fded Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:26:24 -0300 Subject: [PATCH 261/416] monitoring: unify jetson gpu metrics --- scripts/dashboards_render_atlas.py | 33 ++++++++++++++++++- services/monitoring/dashboards/atlas-gpu.json | 4 +-- .../monitoring/dashboards/atlas-overview.json | 2 +- .../monitoring/grafana-dashboard-gpu.yaml | 4 +-- .../grafana-dashboard-overview.yaml | 2 +- .../jetson-tegrastats-exporter.yaml | 4 +++ .../scripts/jetson_tegrastats_exporter.py | 4 ++- 7 files changed, 45 insertions(+), 8 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 5aa77dc1..675fec52 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -208,7 +208,38 @@ def namespace_ram_raw(scope_var): def namespace_gpu_usage_instant(scope_var): - return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" + dcgm = f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" + jetson = jetson_gpu_usage_by_namespace(scope_var) + merged = ( + f'label_replace({dcgm}, "source", "dcgm", "", "") ' + f'or label_replace({jetson}, "source", "jetson", "", "")' + ) + return f"sum by (namespace) ({merged})" + + +def jetson_gpu_util_by_node(): + return 'max by (node) (jetson_gr3d_freq_percent{node!=""})' + + +def jetson_gpu_requests(scope_var): + return ( + "sum by (namespace,node) (" + f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' + "* on(namespace,pod) group_left(node) kube_pod_info " + '* on(node) group_left(label_jetson) kube_node_labels{label_jetson="true"}' + ")" + ) + + +def jetson_gpu_usage_by_namespace(scope_var): + requests_by_ns = jetson_gpu_requests(scope_var) + total_by_node = f"sum by (node) ({requests_by_ns})" + return ( + "sum by (namespace) (" + f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " + f"* on(node) group_left() {jetson_gpu_util_by_node()}" + ")" + ) def namespace_share_expr(resource_expr): diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index af8a1c5b..6b76a5c2 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", + "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 486cd611..04352f93 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index d7950f2b..46b25cd0 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", + "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index afc1e1fb..9495647f 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 8788b206..a8354014 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -44,6 +44,10 @@ spec: env: - name: JETSON_EXPORTER_PORT value: "9100" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumeMounts: - name: script mountPath: /etc/tegrastats-exporter diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index cd557e7c..c4d3fa2a 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -7,6 +7,7 @@ import threading from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) +NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, @@ -60,9 +61,10 @@ class Handler(http.server.BaseHTTPRequestHandler): with LOCK: metrics = METRICS.copy() out = [] + label = f'{{node="{NODE_NAME}"}}' for k, v in metrics.items(): out.append(f"# TYPE jetson_{k} gauge") - out.append(f"jetson_{k} {v}") + out.append(f"jetson_{k}{label} {v}") body = "\\n".join(out) + "\\n" self.send_response(200) self.send_header("Content-Type", "text/plain; version=0.0.4") From 094b2aede691c792b53dd29294c5ac8b53d6d835 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 01:27:02 +0000 Subject: [PATCH 262/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 2678a46b..7528f6f3 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-50 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-51 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 25046d0c86faf2833027f1e3ac58bf4bd2e6b34b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:38:18 -0300 Subject: [PATCH 263/416] atlasbot: replace targeted handlers with generic planner --- services/comms/scripts/atlasbot/bot.py | 573 ++++++++++--------------- 1 file changed, 235 insertions(+), 338 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 233b25e9..987df7a1 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -95,11 +95,29 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" -HOTTEST_QUERIES = { - "cpu": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", - "ram": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", - "net": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", - "io": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + +OPERATION_HINTS = { + "count": ("how many", "count", "number", "total"), + "list": ("list", "which", "what are", "show", "names"), + "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum"), + "status": ("ready", "not ready", "unready", "down", "missing", "status"), +} + +METRIC_HINTS = { + "cpu": ("cpu",), + "ram": ("ram", "memory", "mem"), + "net": ("net", "network", "bandwidth", "throughput"), + "io": ("io", "disk", "storage"), + "connections": ("connections", "conn", "postgres", "database", "db"), +} + +HARDWARE_HINTS = { + "amd64": ("amd64", "x86", "x86_64", "x86-64"), + "jetson": ("jetson",), + "rpi4": ("rpi4",), + "rpi5": ("rpi5",), + "rpi": ("rpi", "raspberry"), + "arm64": ("arm64", "aarch64"), } def normalize_query(text: str) -> str: @@ -312,63 +330,127 @@ def _humanize_rate(value: str, *, unit: str) -> str: return f"{val / 1024:.2f} KB/s" return f"{val:.2f} B/s" -def _hottest_query(metric: str, node_regex: str | None) -> str: - expr = HOTTEST_QUERIES[metric] - if node_regex: - needle = 'node_uname_info{nodename!=""}' - replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' - return expr.replace(needle, replacement) - return expr +def _has_any(text: str, phrases: tuple[str, ...]) -> bool: + return any(p in text for p in phrases) -def _vm_hottest(metric: str, node_regex: str | None) -> tuple[str, str] | None: - expr = _hottest_query(metric, node_regex) - res = vm_query(expr) - series = _vm_value_series(res) - if not series: - return None - first = series[0] - labels = first.get("metric") or {} - value = first.get("value") or [] - val = value[1] if isinstance(value, list) and len(value) > 1 else "" - node = labels.get("node") or labels.get("__name__") or "" - if not node: - return None - return (str(node), str(val)) +def _detect_operation(q: str) -> str | None: + for op, phrases in OPERATION_HINTS.items(): + if _has_any(q, phrases): + return op + return None -def _hottest_answer(q: str, *, nodes: list[str] | None) -> str: - metric = None - assumed_cpu = False - if "cpu" in q: - metric = "cpu" - elif "ram" in q or "memory" in q: - metric = "ram" - elif "net" in q or "network" in q: - metric = "net" - elif "io" in q or "disk" in q or "storage" in q: - metric = "io" - if metric is None: - metric = "cpu" - assumed_cpu = True - if nodes is not None and not nodes: - return "No nodes match the requested hardware class." +def _detect_metric(q: str) -> str | None: + for metric, phrases in METRIC_HINTS.items(): + if _has_any(q, phrases): + return metric + return None - node_regex = "|".join(nodes) if nodes else None - metrics = [metric] - lines: list[str] = [] - for m in metrics: - picked = _vm_hottest(m, node_regex) - if not picked: +def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: + include: set[str] = set() + exclude: set[str] = set() + for hardware, phrases in HARDWARE_HINTS.items(): + for phrase in phrases: + if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q: + exclude.add(hardware) + elif phrase in q: + include.add(hardware) + return include, exclude + +def _detect_entity(q: str) -> str | None: + if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q): + return "node" + if "pod" in q or "pods" in q: + return "pod" + if "namespace" in q or "namespaces" in q: + return "namespace" + return None + +def _metric_entry_score(entry: dict[str, Any], tokens: list[str], *, metric: str | None, op: str | None) -> int: + hay = _metric_tokens(entry) + score = 0 + for t in set(tokens): + if t in hay: + score += 2 if t in (entry.get("panel_title") or "").lower() else 1 + if metric: + for phrase in METRIC_HINTS.get(metric, (metric,)): + if phrase in hay: + score += 3 + if op == "top" and ("hottest" in hay or "top" in hay): + score += 3 + if "node" in hay: + score += 1 + return score + +def _select_metric_entry(tokens: list[str], *, metric: str | None, op: str | None) -> dict[str, Any] | None: + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in _METRIC_INDEX: + if not isinstance(entry, dict): continue - node, val = picked - unit = "%" if m in ("cpu", "ram") else "B/s" - val_str = _humanize_rate(val, unit=unit) - label = {"cpu": "CPU", "ram": "RAM", "net": "NET", "io": "I/O"}[m] - lines.append(f"{label}: {node} ({val_str})") - if not lines: + score = _metric_entry_score(entry, tokens, metric=metric, op=op) + if score: + scored.append((score, entry)) + if not scored: + return None + scored.sort(key=lambda item: item[0], reverse=True) + return scored[0][1] + +def _apply_node_filter(expr: str, node_regex: str | None) -> str: + if not node_regex: + return expr + needle = 'node_uname_info{nodename!=""}' + replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' + return expr.replace(needle, replacement) + +def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str: + series = _vm_value_series(res) + panel = entry.get("panel_title") or "Metric" + if not series: return "" - label = metric.upper() - suffix = " (defaulting to CPU)" if assumed_cpu else "" - return f"Hottest node by {label}: {lines[0].split(': ', 1)[1]}.{suffix}" + rendered = vm_render_result(res, limit=5) + if not rendered: + return "" + lines = [line.lstrip("-").strip() for line in rendered.splitlines() if line.strip().startswith("-")] + if len(lines) == 1: + return f"{panel}: {lines[0]}." + return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines) + +def _inventory_filter( + inventory: list[dict[str, Any]], + *, + include_hw: set[str], + exclude_hw: set[str], + only_workers: bool, + only_ready: bool | None, + nodes_in_query: list[str], +) -> list[dict[str, Any]]: + results = inventory + if nodes_in_query: + results = [node for node in results if node.get("name") in nodes_in_query] + if only_workers: + results = [node for node in results if node.get("is_worker") is True] + if only_ready is True: + results = [node for node in results if node.get("ready") is True] + if only_ready is False: + results = [node for node in results if node.get("ready") is False] + if include_hw: + results = [node for node in results if _hardware_match(node, include_hw)] + if exclude_hw: + results = [node for node in results if not _hardware_match(node, exclude_hw)] + return results + +def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool: + hw = node.get("hardware") or "" + arch = node.get("arch") or "" + for f in filters: + if f == "rpi" and hw in ("rpi4", "rpi5"): + return True + if f == "arm64" and arch == "arm64": + return True + if hw == f: + return True + if f == "amd64" and arch == "amd64": + return True + return False def _node_roles(labels: dict[str, Any]) -> list[str]: roles: list[str] = [] @@ -495,176 +577,103 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: q = normalize_query(prompt) - if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")): - return metrics_summary - - if not inventory: + if not q: return "" - sets = _inventory_sets(inventory) - names = sets["names"] - ready = sets["ready"] - not_ready = sets["not_ready"] - groups = sets["groups"] - worker_names = sets["worker_names"] - worker_ready = sets["worker_ready"] - worker_not_ready = sets["worker_not_ready"] - expected_workers = sets["expected_workers"] - expected_ready = sets["expected_ready"] - expected_not_ready = sets["expected_not_ready"] - expected_missing = sets["expected_missing"] - total = len(names) + tokens = _tokens(q) + op = _detect_operation(q) + metric = _detect_metric(q) + entity = _detect_entity(q) + include_hw, exclude_hw = _detect_hardware_filters(q) nodes_in_query = _extract_titan_nodes(q) - rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])) - non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", [])) - unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])) + only_workers = "worker" in q or "workers" in q + only_ready: bool | None = None + if "not ready" in q or "unready" in q or "down" in q or "missing" in q: + only_ready = False + elif "ready" in q: + only_ready = True - if "hottest" in q or "hot" in q: - filter_nodes: list[str] | None = None - if "amd64" in q or "x86" in q: - filter_nodes = sorted(groups.get("amd64", [])) - elif "jetson" in q: - filter_nodes = sorted(groups.get("jetson", [])) - elif "raspberry" in q or "rpi" in q: - filter_nodes = sorted(rpi_nodes) - elif "arm64" in q: - filter_nodes = sorted([n for n in names if n not in groups.get("amd64", [])]) - hottest = _hottest_answer(q, nodes=filter_nodes) - if hottest: - return hottest - return "Unable to determine hottest nodes right now (metrics unavailable)." + if entity == "node" and only_ready is not None and op != "count": + op = "status" - if nodes_in_query and ("raspberry" in q or "rpi" in q): - parts: list[str] = [] - for node in nodes_in_query: - if node in rpi_nodes: - parts.append(f"{node} is a Raspberry Pi node.") - elif node in non_rpi: - parts.append(f"{node} is not a Raspberry Pi node.") - elif node in names: - parts.append(f"{node} is in Atlas but hardware is unknown.") - else: - parts.append(f"{node} is not in the Atlas cluster.") - return " ".join(parts) + if not op and entity == "node": + op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" - if nodes_in_query and "jetson" in q: - jets = set(groups.get("jetson", [])) - parts = [] - for node in nodes_in_query: - if node in jets: - parts.append(f"{node} is a Jetson node.") - elif node in names: - parts.append(f"{node} is not a Jetson node.") - else: - parts.append(f"{node} is not in the Atlas cluster.") - return " ".join(parts) + if op == "top" and metric is None: + metric = "cpu" - if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q): - parts: list[str] = [] - for node in nodes_in_query: - if node in names: - parts.append(f"Yes. {node} is in the Atlas cluster.") - else: - parts.append(f"No. {node} is not in the Atlas cluster.") - return " ".join(parts) - - if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")): - non_rpi_sorted = sorted(non_rpi) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes." - if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")): - amd = sorted(groups.get("amd64", [])) - return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson." - return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found." - - if "jetson" in q: - jets = groups.get("jetson", []) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(jets)} Jetson nodes." - return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found." - - if "amd64" in q or "x86" in q: - amd = groups.get("amd64", []) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(amd)} amd64 nodes." - return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found." - - if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")): - count = sum(1 for node in inventory if node.get("arch") == "arm64") - return f"Atlas has {count} arm64 nodes." - - if "rpi4" in q: - rpi4 = groups.get("rpi4", []) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(rpi4)} rpi4 nodes." - return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found." - - if "rpi5" in q: - rpi5 = groups.get("rpi5", []) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(rpi5)} rpi5 nodes." - return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found." - - if "raspberry" in q or "rpi" in q: - rpi = sorted(rpi_nodes) - if any(word in q for word in ("how many", "count", "number")): - return f"Atlas has {len(rpi)} Raspberry Pi nodes." - return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found." - - if "arm64-unknown" in q or "unknown" in q or "no hardware" in q: - unknown = sorted(unknown_hw) - return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels." - - if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q): - return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "." - - if "worker" in q and ("node" in q or "nodes" in q or "workers" in q): - not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q) - if expected_workers: - if "missing" in q: - return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "." - if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q): - return ( - f"Expected workers: {len(expected_ready)} ready, " - f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})." + # Metrics-first when a metric or top operation is requested. + if metric or op == "top": + entry = _select_metric_entry(tokens, metric=metric, op=op) + if entry and isinstance(entry.get("exprs"), list) and entry["exprs"]: + expr = entry["exprs"][0] + if inventory: + scoped = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=None, + nodes_in_query=nodes_in_query, ) - if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q): - msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." - if expected_missing: - msg += f" Missing: {', '.join(expected_missing)}." - return msg - if not_ready_query: - if expected_not_ready or expected_missing: - detail = [] - if expected_not_ready: - detail.append(f"Not ready: {', '.join(expected_not_ready)}") - if expected_missing: - detail.append(f"Missing: {', '.join(expected_missing)}") - return "Worker nodes needing attention. " + " ".join(detail) + "." - return "All expected worker nodes are Ready." - if any(word in q for word in ("expected", "expect", "should")): - msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." - if expected_missing: - msg += f" Missing: {', '.join(expected_missing)}." - return msg - if any(word in q for word in ("how many", "count", "number")): - return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})." - if "ready" in q: - return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}." - if not_ready_query: - return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "." - if any(word in q for word in ("how many", "count", "number")): - return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready." - return "Ready worker nodes ({}): {}.".format(len(worker_ready), ", ".join(worker_ready)) + if scoped: + node_regex = "|".join([n["name"] for n in scoped]) + expr = _apply_node_filter(expr, node_regex) + res = vm_query(expr, timeout=20) + answer = _format_metric_answer(entry, res) + if answer: + return answer + if metrics_summary: + return metrics_summary - if any(word in q for word in ("how many", "count", "number")) and "node" in q: - return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready." + if entity != "node" or not inventory: + if any(word in q for word in METRIC_HINT_WORDS) and not metrics_summary: + return "I don't have data to answer that right now." + return "" - if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q: - return "Atlas node names: " + ", ".join(names) + "." + expected_workers = expected_worker_nodes_from_metrics() + filtered = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=only_ready if op in ("status", "count") else None, + nodes_in_query=nodes_in_query, + ) + names = [node["name"] for node in filtered] - if "ready" in q and "node" in q: - return f"Ready nodes ({len(ready)}): {', '.join(ready)}." + if op == "status": + if "missing" in q and expected_workers: + missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) + return "Missing nodes: " + (", ".join(missing) if missing else "none") + "." + if only_ready is False: + return "Not ready nodes: " + (", ".join(names) if names else "none") + "." + if only_ready is True: + return f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + "." + + if op == "count": + if expected_workers and ("expected" in q or "should" in q): + missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if missing: + msg += f" Missing: {', '.join(missing)}." + return msg + if not (include_hw or exclude_hw or nodes_in_query or only_workers): + return f"Atlas has {len(names)} nodes." + return f"Matching nodes: {len(names)}." + + if op == "list": + if nodes_in_query: + parts = [] + existing = {n["name"] for n in inventory} + for node in nodes_in_query: + parts.append(f"{node}: {'present' if node in existing else 'not present'}") + return "Node presence: " + ", ".join(parts) + "." + if not names: + return "Matching nodes: none." + shown = names[:30] + suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else "" + return "Matching nodes: " + ", ".join(shown) + suffix + "." return "" @@ -727,25 +736,6 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: fallback = _metrics_fallback_summary(panel, summary) return context, fallback -def jetson_nodes_from_kb() -> list[str]: - for doc in KB.get("runbooks", []): - if not isinstance(doc, dict): - continue - body = str(doc.get("body") or "") - for line in body.splitlines(): - if "jetson" not in line.lower(): - continue - names = _extract_titan_nodes(line) - if names: - return names - return [] - -def jetson_nodes_summary(cluster_name: str) -> str: - names = jetson_nodes_from_kb() - if names: - return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}." - return "" - def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]: q = (query or "").strip() if not q or not KB.get("catalog"): @@ -953,22 +943,16 @@ def _parse_metric_lines(summary: str) -> dict[str, str]: def _metrics_fallback_summary(panel: str, summary: str) -> str: parsed = _parse_metric_lines(summary) panel_l = (panel or "").lower() - if panel_l.startswith("postgres connections"): - used = parsed.get("conn=used") - maxv = parsed.get("conn=max") - if used and maxv: - try: - used_i = int(float(used)) - max_i = int(float(maxv)) - except ValueError: - return f"Postgres connections: {summary}" - free = max_i - used_i - return f"Postgres connections: {used_i}/{max_i} used ({free} free)." - if panel_l.startswith("postgres hottest"): - if parsed: - label, value = next(iter(parsed.items())) - return f"Most Postgres connections: {label} = {value}." - return f"{panel}: {summary}" + if parsed: + items = list(parsed.items()) + if len(items) == 1: + label, value = items[0] + return f"{panel}: {label} = {value}." + compact = "; ".join(f"{k}={v}" for k, v in items) + return f"{panel}: {compact}." + if panel_l: + return f"{panel}: {summary}" + return summary def _node_ready_status(node: dict) -> bool | None: conditions = node.get("status", {}).get("conditions") or [] @@ -1075,93 +1059,6 @@ def vm_cluster_snapshot() -> str: parts.append(pr) return "\n".join(parts).strip() -def nodes_summary(cluster_name: str) -> str: - state = _ariadne_state() - if state: - nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {} - total = nodes.get("total") - ready = nodes.get("ready") - not_ready = nodes.get("not_ready") - if isinstance(total, int) and isinstance(ready, int): - not_ready = not_ready if isinstance(not_ready, int) else max(total - ready, 0) - if not_ready: - return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." - return f"{cluster_name} cluster has {total} nodes, all Ready." - try: - data = k8s_get("/api/v1/nodes?limit=500") - except Exception: - return "" - items = data.get("items") or [] - if not isinstance(items, list) or not items: - return "" - total = len(items) - ready = 0 - for node in items: - conditions = node.get("status", {}).get("conditions") or [] - for cond in conditions if isinstance(conditions, list) else []: - if cond.get("type") == "Ready": - if cond.get("status") == "True": - ready += 1 - break - not_ready = max(total - ready, 0) - if not_ready: - return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady." - return f"{cluster_name} cluster has {total} nodes, all Ready." - -def nodes_names_summary(cluster_name: str) -> str: - state = _ariadne_state() - if state: - nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {} - names = nodes.get("names") - if isinstance(names, list) and names: - cleaned = sorted({str(n) for n in names if n}) - if len(cleaned) <= 30: - return f"{cluster_name} node names: {', '.join(cleaned)}." - shown = ", ".join(cleaned[:30]) - return f"{cluster_name} node names: {shown}, … (+{len(cleaned) - 30} more)." - try: - data = k8s_get("/api/v1/nodes?limit=500") - except Exception: - return "" - items = data.get("items") or [] - if not isinstance(items, list) or not items: - return "" - names = [] - for node in items: - name = (node.get("metadata") or {}).get("name") or "" - if name: - names.append(name) - names = sorted(set(names)) - if not names: - return "" - if len(names) <= 30: - return f"{cluster_name} node names: {', '.join(names)}." - shown = ", ".join(names[:30]) - return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)." - - -def nodes_arch_summary(cluster_name: str, arch: str) -> str: - try: - data = k8s_get("/api/v1/nodes?limit=500") - except Exception: - return "" - items = data.get("items") or [] - if not isinstance(items, list) or not items: - return "" - normalized = (arch or "").strip().lower() - if normalized in ("aarch64", "arm64"): - arch_label = "arm64" - elif normalized in ("x86_64", "x86-64", "amd64"): - arch_label = "amd64" - else: - arch_label = normalized - total = 0 - for node in items: - labels = (node.get("metadata") or {}).get("labels") or {} - if labels.get("kubernetes.io/arch") == arch_label: - total += 1 - return f"{cluster_name} cluster has {total} {arch_label} nodes." - def _strip_code_fence(text: str) -> str: cleaned = (text or "").strip() match = CODE_FENCE_RE.match(cleaned) From cb8bd3375e70b6f3f55fbb420327f7e4e62e3cbc Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:39:01 -0300 Subject: [PATCH 264/416] comms: restart atlasbot for generic planner --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d5ad62eb..d195e890 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-23 + checksum/atlasbot-configmap: manual-atlasbot-24 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From d113954f73cd4f4d078f359423261aa0afe796f4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:43:58 -0300 Subject: [PATCH 265/416] atlasbot: add internal endpoint and portal wiring --- .../bstein-dev-home/backend-deployment.yaml | 5 ++ services/comms/atlasbot-deployment.yaml | 7 ++- services/comms/atlasbot-service.yaml | 15 +++++ services/comms/kustomization.yaml | 1 + services/comms/scripts/atlasbot/bot.py | 58 +++++++++++++++++++ 5 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 services/comms/atlasbot-service.yaml diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index ecf478cc..26c99e11 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -28,6 +28,7 @@ spec: {{ with secret "kv/data/atlas/shared/chat-ai-keys-runtime" }} export CHAT_KEY_MATRIX="{{ .Data.data.matrix }}" export CHAT_KEY_HOMEPAGE="{{ .Data.data.homepage }}" + export AI_ATLASBOT_TOKEN="{{ .Data.data.homepage }}" {{ end }} {{ with secret "kv/data/atlas/shared/portal-e2e-client" }} export PORTAL_E2E_CLIENT_ID="{{ .Data.data.client_id }}" @@ -66,6 +67,10 @@ spec: value: qwen2.5-coder:7b-instruct-q4_0 - name: AI_CHAT_TIMEOUT_SEC value: "480" + - name: AI_ATLASBOT_ENDPOINT + value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer + - name: AI_ATLASBOT_TIMEOUT_SEC + value: "5" - name: AI_NODE_NAME valueFrom: fieldRef: diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d195e890..c0596b67 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-24 + checksum/atlasbot-configmap: manual-atlasbot-25 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -87,6 +87,11 @@ spec: value: "480" - name: ATLASBOT_THINKING_INTERVAL_SEC value: "120" + - name: ATLASBOT_HTTP_PORT + value: "8090" + ports: + - name: http + containerPort: 8090 resources: requests: cpu: 100m diff --git a/services/comms/atlasbot-service.yaml b/services/comms/atlasbot-service.yaml new file mode 100644 index 00000000..c8b35705 --- /dev/null +++ b/services/comms/atlasbot-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: atlasbot + namespace: comms + labels: + app: atlasbot +spec: + selector: + app: atlasbot + ports: + - name: http + port: 8090 + targetPort: 8090 + type: ClusterIP diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 37f681de..410f2a69 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -14,6 +14,7 @@ resources: - guest-register-deployment.yaml - guest-register-service.yaml - atlasbot-deployment.yaml + - atlasbot-service.yaml - wellknown.yaml - atlasbot-rbac.yaml - mas-secrets-ensure-rbac.yaml diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 987df7a1..deb8e62c 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -5,6 +5,7 @@ import re import ssl import threading import time +from http.server import BaseHTTPRequestHandler, HTTPServer from typing import Any from urllib import error, parse, request @@ -1089,6 +1090,62 @@ def _normalize_reply(value: Any) -> str: return text +# Internal HTTP endpoint for cluster answers (website uses this). +class _AtlasbotHandler(BaseHTTPRequestHandler): + server_version = "AtlasbotHTTP/1.0" + + def _write_json(self, status: int, payload: dict[str, Any]): + body = json.dumps(payload).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _authorized(self) -> bool: + if not ATLASBOT_INTERNAL_TOKEN: + return True + token = self.headers.get("X-Internal-Token", "") + return token == ATLASBOT_INTERNAL_TOKEN + + def do_GET(self): # noqa: N802 + if self.path == "/health": + self._write_json(200, {"status": "ok"}) + return + self._write_json(404, {"error": "not_found"}) + + def do_POST(self): # noqa: N802 + if self.path != "/v1/answer": + self._write_json(404, {"error": "not_found"}) + return + if not self._authorized(): + self._write_json(401, {"error": "unauthorized"}) + return + try: + length = int(self.headers.get("Content-Length", "0")) + except ValueError: + length = 0 + raw = self.rfile.read(length) if length > 0 else b"" + try: + payload = json.loads(raw.decode("utf-8")) if raw else {} + except json.JSONDecodeError: + self._write_json(400, {"error": "invalid_json"}) + return + prompt = str(payload.get("prompt") or payload.get("question") or "").strip() + if not prompt: + self._write_json(400, {"error": "missing_prompt"}) + return + inventory = node_inventory_live() + answer = structured_answer(prompt, inventory=inventory, metrics_summary="") + self._write_json(200, {"answer": answer}) + + +def _start_http_server(): + server = HTTPServer(("0.0.0.0", ATLASBOT_HTTP_PORT), _AtlasbotHandler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + + # Conversation state. history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript) @@ -1326,6 +1383,7 @@ def login_with_retry(): def main(): load_kb() + _start_http_server() token = login_with_retry() try: room_id = resolve_alias(token, ROOM_ALIAS) From efe7b9bc5fbd7b7a4dadbf1869323caa862b44e5 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 01:47:43 +0000 Subject: [PATCH 266/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 90c3b8de..fe604b6c 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 9abfcffd8016540a290bdb7d783d5e75696a49e3 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 01:47:47 +0000 Subject: [PATCH 267/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index fe604b6c..f50c38b0 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 5393585f3e81423e889deac602b36ae78a2fbb1c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:50:12 -0300 Subject: [PATCH 268/416] monitoring: fix jetson metrics newlines --- services/monitoring/scripts/jetson_tegrastats_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index c4d3fa2a..c237ec5d 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -65,7 +65,7 @@ class Handler(http.server.BaseHTTPRequestHandler): for k, v in metrics.items(): out.append(f"# TYPE jetson_{k} gauge") out.append(f"jetson_{k}{label} {v}") - body = "\\n".join(out) + "\\n" + body = "\n".join(out) + "\n" self.send_response(200) self.send_header("Content-Type", "text/plain; version=0.0.4") self.send_header("Content-Length", str(len(body))) From e21bc8ef40166c319cdcf3be1d2b0f23154aa7b5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:51:04 -0300 Subject: [PATCH 269/416] atlasbot: prioritize top queries over list --- services/comms/scripts/atlasbot/bot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index deb8e62c..e6c7542b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -335,7 +335,11 @@ def _has_any(text: str, phrases: tuple[str, ...]) -> bool: return any(p in text for p in phrases) def _detect_operation(q: str) -> str | None: + if _has_any(q, OPERATION_HINTS["top"]): + return "top" for op, phrases in OPERATION_HINTS.items(): + if op == "top": + continue if _has_any(q, phrases): return op return None From d5478e272ece666a4d5d3fba50190ba53dba4cf5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:51:41 -0300 Subject: [PATCH 270/416] monitoring: restart jetson exporter --- services/monitoring/jetson-tegrastats-exporter.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index a8354014..8584ebaa 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,6 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" + monitoring.bstein.dev/restart-rev: "1" spec: serviceAccountName: default hostPID: true From 3340b5bf9d61e8eaeb006d214606909f2180af40 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:52:49 -0300 Subject: [PATCH 271/416] comms: restart atlasbot for op priority --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c0596b67..3ebb8610 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-25 + checksum/atlasbot-configmap: manual-atlasbot-26 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 8587da0e372672c654d38e13da13540d7cb515b6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 22:54:43 -0300 Subject: [PATCH 272/416] comms: rerun synapse user seed --- services/comms/synapse-user-seed-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/synapse-user-seed-job.yaml b/services/comms/synapse-user-seed-job.yaml index 7fef796e..aab88c3b 100644 --- a/services/comms/synapse-user-seed-job.yaml +++ b/services/comms/synapse-user-seed-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-user-seed-7 + name: synapse-user-seed-8 namespace: comms spec: backoffLimit: 1 From c6d811e29d9701db44c2212d2eea0b81c7363cbd Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 02:52:49 +0000 Subject: [PATCH 273/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f50c38b0..d6208c42 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From a9b6729eb2282f6a3935dea74f2b2f1ef8699c16 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 02:53:50 +0000 Subject: [PATCH 274/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index d6208c42..a520991b 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 1b6fac86fbe46c5228d8162788fb88583949b4a7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 01:04:38 -0300 Subject: [PATCH 275/416] vault: bootstrap k8s auth config with root token --- services/vault/k8s-auth-config-cronjob.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml index 43da16b4..5a2d6829 100644 --- a/services/vault/k8s-auth-config-cronjob.yaml +++ b/services/vault/k8s-auth-config-cronjob.yaml @@ -34,6 +34,11 @@ spec: value: http://10.43.57.249:8200 - name: VAULT_K8S_ROLE value: vault-admin + - name: VAULT_TOKEN + valueFrom: + secretKeyRef: + name: vault-init + key: root_token - name: VAULT_K8S_TOKEN_REVIEWER_JWT_FILE value: /var/run/secrets/vault-token-reviewer/token - name: VAULT_K8S_ROLE_TTL From e622b1ae09f7fa06593880208ad4c5f57cdc9109 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 01:14:42 -0300 Subject: [PATCH 276/416] comms: rerun ensure jobs and fix vault oidc env --- services/comms/comms-secrets-ensure-job.yaml | 2 +- services/comms/mas-local-users-ensure-job.yaml | 2 +- services/maintenance/ariadne-deployment.yaml | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/services/comms/comms-secrets-ensure-job.yaml b/services/comms/comms-secrets-ensure-job.yaml index b71dd403..52904cc9 100644 --- a/services/comms/comms-secrets-ensure-job.yaml +++ b/services/comms/comms-secrets-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: comms-secrets-ensure-6 + name: comms-secrets-ensure-7 namespace: comms spec: backoffLimit: 1 diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml index c8cf5f04..d385b473 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/mas-local-users-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: mas-local-users-ensure-16 + name: mas-local-users-ensure-17 namespace: comms spec: backoffLimit: 1 diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 33b8a12a..6fa638d3 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -89,7 +89,11 @@ spec: export VAULT_OIDC_USER_POLICIES="{{ .Data.data.user_policies }}" export VAULT_OIDC_REDIRECT_URIS="{{ .Data.data.redirect_uris }}" export VAULT_OIDC_BOUND_AUDIENCES="{{ .Data.data.bound_audiences }}" + {{- if .Data.data.bound_claims_type }} export VAULT_OIDC_BOUND_CLAIMS_TYPE="{{ .Data.data.bound_claims_type }}" + {{- else }} + export VAULT_OIDC_BOUND_CLAIMS_TYPE="string" + {{- end }} {{ end }} spec: serviceAccountName: ariadne From b2a464b80a9172a5affcca8f27f80956fc85e5a2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 01:19:43 -0300 Subject: [PATCH 277/416] comms: rerun mas local user ensure --- services/comms/mas-local-users-ensure-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml index d385b473..636ee5bb 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/mas-local-users-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: mas-local-users-ensure-17 + name: mas-local-users-ensure-18 namespace: comms spec: backoffLimit: 1 From 2d996ffd6ec916d2fcdd806d12248e08e92207da Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 01:22:02 -0300 Subject: [PATCH 278/416] comms: rerun synapse seeder admin ensure --- services/comms/synapse-seeder-admin-ensure-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/synapse-seeder-admin-ensure-job.yaml index ce8ccd35..5d2d4225 100644 --- a/services/comms/synapse-seeder-admin-ensure-job.yaml +++ b/services/comms/synapse-seeder-admin-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-seeder-admin-ensure-8 + name: synapse-seeder-admin-ensure-9 namespace: comms spec: backoffLimit: 2 From 3579e906b43e9484389ab52b6fc90c99e7e83ebf Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 06:51:28 +0000 Subject: [PATCH 279/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 7528f6f3..c8f9f2c8 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-51 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-54 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From ad6c927370a966b0aedafc3f045c4735c8721a69 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 03:56:00 -0300 Subject: [PATCH 280/416] atlasbot: clarify scoped metrics and format percent values --- services/comms/scripts/atlasbot/bot.py | 57 ++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e6c7542b..f8b3ccff 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -406,15 +406,56 @@ def _apply_node_filter(expr: str, node_regex: str | None) -> str: replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' return expr.replace(needle, replacement) +def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool: + exprs = entry.get("exprs") + expr = exprs[0] if isinstance(exprs, list) and exprs else "" + return "* 100" in expr or "*100" in expr + + +def _format_metric_value(value: str, *, percent: bool) -> str: + try: + num = float(value) + except (TypeError, ValueError): + return value + if percent: + return f"{num:.1f}%" + if abs(num) >= 1: + return f"{num:.2f}".rstrip("0").rstrip(".") + return f"{num:.4f}".rstrip("0").rstrip(".") + + +def _format_metric_label(metric: dict[str, Any]) -> str: + label_parts = [] + for k in ("namespace", "pod", "container", "node", "instance", "job", "phase"): + if metric.get(k): + label_parts.append(f"{k}={metric.get(k)}") + if not label_parts: + for k in sorted(metric.keys()): + if k.startswith("__"): + continue + label_parts.append(f"{k}={metric.get(k)}") + if len(label_parts) >= 4: + break + return ", ".join(label_parts) if label_parts else "series" + + def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str: series = _vm_value_series(res) panel = entry.get("panel_title") or "Metric" if not series: return "" - rendered = vm_render_result(res, limit=5) - if not rendered: + percent = _metric_expr_uses_percent(entry) + lines: list[str] = [] + for r in series[:5]: + if not isinstance(r, dict): + continue + metric = r.get("metric") or {} + value = r.get("value") or [] + val = value[1] if isinstance(value, list) and len(value) > 1 else "" + label = _format_metric_label(metric if isinstance(metric, dict) else {}) + lines.append(f"{label}: {_format_metric_value(val, percent=percent)}") + if not lines: return "" - lines = [line.lstrip("-").strip() for line in rendered.splitlines() if line.strip().startswith("-")] if len(lines) == 1: return f"{panel}: {lines[0]}." return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines) @@ -627,6 +668,16 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s res = vm_query(expr, timeout=20) answer = _format_metric_answer(entry, res) if answer: + scope_parts: list[str] = [] + if include_hw: + scope_parts.append(" and ".join(sorted(include_hw))) + if exclude_hw: + scope_parts.append(f"excluding {' and '.join(sorted(exclude_hw))}") + if only_workers: + scope_parts.append("worker") + if scope_parts: + scope = " ".join(scope_parts) + return f"Among {scope} nodes, {answer}" return answer if metrics_summary: return metrics_summary From 16a059134ae2b10ed237f929596c33c5af50b6bd Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 03:56:47 -0300 Subject: [PATCH 281/416] comms: restart atlasbot for metrics formatting --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 3ebb8610..83e0b2ed 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-26 + checksum/atlasbot-configmap: manual-atlasbot-27 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 00c0375790148e4e2c41e885eedf854578ddef29 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:48:44 -0300 Subject: [PATCH 282/416] comms: add synapse admin ensure job --- services/comms/kustomization.yaml | 1 + services/comms/synapse-admin-ensure-job.yaml | 177 ++++++++++++++++++ services/maintenance/ariadne-deployment.yaml | 3 + .../vault/scripts/vault_k8s_auth_configure.sh | 4 +- 4 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 services/comms/synapse-admin-ensure-job.yaml diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 410f2a69..01d7be5c 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -25,6 +25,7 @@ resources: - mas-admin-client-secret-ensure-job.yaml - mas-db-ensure-job.yaml - comms-secrets-ensure-job.yaml + - synapse-admin-ensure-job.yaml - synapse-signingkey-ensure-job.yaml - synapse-seeder-admin-ensure-job.yaml - synapse-user-seed-job.yaml diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/synapse-admin-ensure-job.yaml new file mode 100644 index 00000000..be9e0fd1 --- /dev/null +++ b/services/comms/synapse-admin-ensure-job.yaml @@ -0,0 +1,177 @@ +# services/comms/synapse-admin-ensure-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: synapse-admin-ensure-1 + namespace: comms +spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + spec: + serviceAccountName: comms-secrets-ensure + restartPolicy: OnFailure + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] + containers: + - name: ensure + image: python:3.11-slim + env: + - name: VAULT_ADDR + value: http://vault.vault.svc.cluster.local:8200 + - name: VAULT_ROLE + value: comms-secrets + - name: SYNAPSE_ADMIN_URL + value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008 + command: + - /bin/sh + - -c + - | + set -euo pipefail + python - <<'PY' + import base64 + import hashlib + import hmac + import json + import os + import secrets + import string + import urllib.error + import urllib.request + + VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/") + VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets") + SYNAPSE_ADMIN_URL = os.environ.get( + "SYNAPSE_ADMIN_URL", + "http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008", + ).rstrip("/") + SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" + + def log(msg: str) -> None: + print(msg, flush=True) + + def request_json(url: str, payload: dict | None = None) -> dict: + data = None + headers = {"Content-Type": "application/json"} + if payload is not None: + data = json.dumps(payload).encode("utf-8") + req = urllib.request.Request(url, data=data, headers=headers, method="POST" if data else "GET") + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read().decode("utf-8")) + + def vault_login() -> str: + with open(SA_TOKEN_PATH, "r", encoding="utf-8") as f: + jwt = f.read().strip() + payload = {"jwt": jwt, "role": VAULT_ROLE} + resp = request_json(f"{VAULT_ADDR}/v1/auth/kubernetes/login", payload) + token = resp.get("auth", {}).get("client_token") + if not token: + raise RuntimeError("vault login failed") + return token + + def vault_get(token: str, path: str) -> dict: + req = urllib.request.Request( + f"{VAULT_ADDR}/v1/kv/data/atlas/{path}", + headers={"X-Vault-Token": token}, + ) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + payload = json.loads(resp.read().decode("utf-8")) + return payload.get("data", {}).get("data", {}) + except urllib.error.HTTPError as exc: + if exc.code == 404: + return {} + raise + + def vault_put(token: str, path: str, data: dict) -> None: + payload = {"data": data} + req = urllib.request.Request( + f"{VAULT_ADDR}/v1/kv/data/atlas/{path}", + data=json.dumps(payload).encode("utf-8"), + headers={"X-Vault-Token": token, "Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=30) as resp: + resp.read() + + def random_password(length: int = 32) -> str: + alphabet = string.ascii_letters + string.digits + return "".join(secrets.choice(alphabet) for _ in range(length)) + + def ensure_registration_secret(token: str) -> str: + data = vault_get(token, "comms/synapse-registration") + secret = (data.get("registration_shared_secret") or "").strip() + if not secret: + secret = secrets.token_urlsafe(32) + data["registration_shared_secret"] = secret + vault_put(token, "comms/synapse-registration", data) + log("registration secret created") + return secret + + def ensure_admin_creds(token: str) -> dict: + data = vault_get(token, "comms/synapse-admin") + username = (data.get("username") or "").strip() or "synapse-admin" + password = (data.get("password") or "").strip() + if not password: + password = random_password() + data["username"] = username + data["password"] = password + vault_put(token, "comms/synapse-admin", data) + return data + + def register_admin(secret: str, username: str, password: str) -> str: + nonce_payload = request_json(f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register") + nonce = nonce_payload.get("nonce") + if not nonce: + raise RuntimeError("synapse register nonce missing") + admin_flag = "admin" + user_type = "" + mac_payload = "\x00".join([nonce, username, password, admin_flag, user_type]) + mac = hmac.new(secret.encode("utf-8"), mac_payload.encode("utf-8"), hashlib.sha1).hexdigest() + payload = { + "nonce": nonce, + "username": username, + "password": password, + "admin": True, + "mac": mac, + } + req = urllib.request.Request( + f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register", + data=json.dumps(payload).encode("utf-8"), + headers={"Content-Type": "application/json"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + payload = json.loads(resp.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + body = exc.read().decode("utf-8") + raise RuntimeError(f"synapse admin register failed: {exc.code} {body}") from exc + access_token = payload.get("access_token") + if not access_token: + raise RuntimeError("synapse admin token missing") + return access_token + + vault_token = vault_login() + reg_secret = ensure_registration_secret(vault_token) + admin_data = ensure_admin_creds(vault_token) + if admin_data.get("access_token"): + log("synapse admin token already present") + raise SystemExit(0) + access_token = register_admin(reg_secret, admin_data["username"], admin_data["password"]) + admin_data["access_token"] = access_token + vault_put(vault_token, "comms/synapse-admin", admin_data) + log("synapse admin user ensured") + PY diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 6fa638d3..fce1ded5 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -69,6 +69,9 @@ spec: export COMMS_BOT_PASSWORD="{{ index .Data.data "bot-password" }}" export COMMS_SEEDER_PASSWORD="{{ index .Data.data "seeder-password" }}" {{ end }} + {{ with secret "kv/data/atlas/comms/synapse-admin" }} + export COMMS_SYNAPSE_ADMIN_TOKEN="{{ .Data.data.access_token }}" + {{ end }} {{ with secret "kv/data/atlas/comms/synapse-db" }} export COMMS_SYNAPSE_DB_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}" {{ end }} diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 21132c79..0212180f 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ - "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" "" + "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ @@ -253,4 +253,4 @@ write_policy_and_role "crypto-secrets" "crypto" "crypto-secrets-ensure" \ write_policy_and_role "comms-secrets" "comms" \ "comms-secrets-ensure,mas-db-ensure,mas-admin-client-secret-writer,othrys-synapse-signingkey-job" \ "" \ - "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey" + "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin comms/synapse-registration comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey" From 64e59a9b77a0043d5e76f923959d244411eb67e3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:51:20 -0300 Subject: [PATCH 283/416] atlasbot: add knowledge summaries and better fallback --- services/comms/scripts/atlasbot/bot.py | 110 +++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 7 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f8b3ccff..3a1a0002 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -254,14 +254,14 @@ def load_kb(): _NAME_INDEX = names _METRIC_INDEX = metrics if isinstance(metrics, list) else [] -def kb_retrieve(query: str, *, limit: int = 3) -> str: +def _score_kb_docs(query: str) -> list[dict[str, Any]]: q = (query or "").strip() if not q or not KB.get("runbooks"): - return "" + return [] ql = q.lower() q_tokens = _tokens(q) if not q_tokens: - return "" + return [] scored: list[tuple[int, dict]] = [] for doc in KB.get("runbooks", []): @@ -281,9 +281,16 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: score += 4 if score: scored.append((score, doc)) - scored.sort(key=lambda x: x[0], reverse=True) - picked = [d for _, d in scored[:limit]] + return [d for _, d in scored] + + +def kb_retrieve(query: str, *, limit: int = 3) -> str: + q = (query or "").strip() + if not q: + return "" + scored = _score_kb_docs(q) + picked = scored[:limit] if not picked: return "" @@ -301,6 +308,22 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: used += len(chunk) return "\n".join(parts).strip() + +def kb_retrieve_titles(query: str, *, limit: int = 4) -> str: + scored = _score_kb_docs(query) + picked = scored[:limit] + if not picked: + return "" + parts = ["Relevant runbooks:"] + for doc in picked: + title = doc.get("title") or doc.get("path") or "runbook" + path = doc.get("path") or "" + if path: + parts.append(f"- {title} ({path})") + else: + parts.append(f"- {title}") + return "\n".join(parts) + def _extract_titan_nodes(text: str) -> list[str]: cleaned = normalize_query(text) names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n} @@ -439,6 +462,18 @@ def _format_metric_label(metric: dict[str, Any]) -> str: return ", ".join(label_parts) if label_parts else "series" +def _primary_series_metric(res: dict | None) -> tuple[str | None, str | None]: + series = _vm_value_series(res or {}) + if not series: + return (None, None) + first = series[0] + metric = first.get("metric") if isinstance(first, dict) else {} + value = first.get("value") if isinstance(first, dict) else [] + node = metric.get("node") if isinstance(metric, dict) else None + val = value[1] if isinstance(value, list) and len(value) > 1 else None + return (node, val) + + def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str: series = _vm_value_series(res) panel = entry.get("panel_title") or "Metric" @@ -677,7 +712,15 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s scope_parts.append("worker") if scope_parts: scope = " ".join(scope_parts) - return f"Among {scope} nodes, {answer}" + overall_note = "" + base_res = vm_query(entry["exprs"][0], timeout=20) + base_node, base_val = _primary_series_metric(base_res) + scoped_node, scoped_val = _primary_series_metric(res) + if base_node and scoped_node and base_node != scoped_node: + percent = _metric_expr_uses_percent(entry) + base_val_fmt = _format_metric_value(base_val or "", percent=percent) + overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})." + return f"Among {scope} nodes, {answer}{overall_note}" return answer if metrics_summary: return metrics_summary @@ -1075,7 +1118,7 @@ def _context_fallback(context: str) -> str: trimmed = context.strip() if len(trimmed) > MAX_TOOL_CHARS: trimmed = trimmed[: MAX_TOOL_CHARS - 3].rstrip() + "..." - return "I couldn’t reach the model backend. Here is the data I found:\n" + trimmed + return "Here is what I found:\n" + trimmed def vm_top_restarts(hours: int = 1) -> str: q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" @@ -1192,6 +1235,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): return inventory = node_inventory_live() answer = structured_answer(prompt, inventory=inventory, metrics_summary="") + if not answer and _knowledge_intent(prompt): + answer = knowledge_summary(prompt, inventory) + if not answer: + kb = kb_retrieve_titles(prompt, limit=4) + answer = kb or "" self._write_json(200, {"answer": answer}) @@ -1257,6 +1305,48 @@ def build_context( return "\n\n".join([p for p in parts if p]).strip() + +def _knowledge_intent(prompt: str) -> bool: + q = normalize_query(prompt) + return any( + phrase in q + for phrase in ( + "what do you know", + "tell me about", + "overview", + "summary", + "describe", + "explain", + "what is", + ) + ) + + +def _inventory_summary(inventory: list[dict[str, Any]]) -> str: + if not inventory: + return "" + groups = _group_nodes(inventory) + total = len(inventory) + ready = [n for n in inventory if n.get("ready") is True] + not_ready = [n for n in inventory if n.get("ready") is False] + parts = [f"Atlas cluster: {total} nodes ({len(ready)} ready, {len(not_ready)} not ready)."] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes = groups.get(key) or [] + if nodes: + parts.append(f"- {key}: {len(nodes)} nodes ({', '.join(nodes)})") + return "\n".join(parts) + + +def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: + parts: list[str] = [] + inv = _inventory_summary(inventory) + if inv: + parts.append(inv) + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + parts.append(kb_titles) + return "\n".join(parts).strip() + def _ollama_call(hist_key, prompt: str, *, context: str) -> str: system = ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " @@ -1416,6 +1506,12 @@ def sync_loop(token: str, room_id: str): send_msg(token, rid, structured) continue + if _knowledge_intent(body): + summary = knowledge_summary(body, inventory) + if summary: + send_msg(token, rid, summary) + continue + reply = ollama_reply_with_thinking( token, rid, From 7c14fe7b3c0541d4223b98da47087f08e97b9d4f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:51:33 -0300 Subject: [PATCH 284/416] comms: restart atlasbot for knowledge summaries --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 83e0b2ed..5198f2a9 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-27 + checksum/atlasbot-configmap: manual-atlasbot-28 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 89949835d95a6839f850a71eef9e524cea4c7ee3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:53:33 -0300 Subject: [PATCH 285/416] atlasbot: scope overall hottest node to atlas inventory --- services/comms/scripts/atlasbot/bot.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 3a1a0002..8df1317d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -713,7 +713,12 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s if scope_parts: scope = " ".join(scope_parts) overall_note = "" - base_res = vm_query(entry["exprs"][0], timeout=20) + base_expr = entry["exprs"][0] + if inventory: + all_nodes = "|".join([n["name"] for n in inventory]) + if all_nodes: + base_expr = _apply_node_filter(base_expr, all_nodes) + base_res = vm_query(base_expr, timeout=20) base_node, base_val = _primary_series_metric(base_res) scoped_node, scoped_val = _primary_series_metric(res) if base_node and scoped_node and base_node != scoped_node: From 48b5b018cad32af9b1439a3d597c58e8a2a617d9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:53:44 -0300 Subject: [PATCH 286/416] comms: restart atlasbot for scoped hottest --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 5198f2a9..e35fa619 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-28 + checksum/atlasbot-configmap: manual-atlasbot-29 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 4083c3dcfafea08ce4d6402d3cb62244976a4249 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 04:58:13 -0300 Subject: [PATCH 287/416] comms: ensure synapse admin token --- services/comms/synapse-admin-ensure-job.yaml | 141 ++++++++++++------- 1 file changed, 89 insertions(+), 52 deletions(-) diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/synapse-admin-ensure-job.yaml index be9e0fd1..6ddea830 100644 --- a/services/comms/synapse-admin-ensure-job.yaml +++ b/services/comms/synapse-admin-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-admin-ensure-1 + name: synapse-admin-ensure-2 namespace: comms spec: backoffLimit: 1 @@ -40,24 +40,26 @@ spec: - -c - | set -euo pipefail + pip install --no-cache-dir psycopg2-binary bcrypt >/dev/null python - <<'PY' - import base64 - import hashlib - import hmac import json import os import secrets import string + import time import urllib.error import urllib.request + import bcrypt + import psycopg2 + VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/") VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets") - SYNAPSE_ADMIN_URL = os.environ.get( - "SYNAPSE_ADMIN_URL", - "http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008", - ).rstrip("/") SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" + PGHOST = "postgres-service.postgres.svc.cluster.local" + PGPORT = 5432 + PGDATABASE = "synapse" + PGUSER = "synapse" def log(msg: str) -> None: print(msg, flush=True) @@ -110,16 +112,6 @@ spec: alphabet = string.ascii_letters + string.digits return "".join(secrets.choice(alphabet) for _ in range(length)) - def ensure_registration_secret(token: str) -> str: - data = vault_get(token, "comms/synapse-registration") - secret = (data.get("registration_shared_secret") or "").strip() - if not secret: - secret = secrets.token_urlsafe(32) - data["registration_shared_secret"] = secret - vault_put(token, "comms/synapse-registration", data) - log("registration secret created") - return secret - def ensure_admin_creds(token: str) -> dict: data = vault_get(token, "comms/synapse-admin") username = (data.get("username") or "").strip() or "synapse-admin" @@ -131,47 +123,92 @@ spec: vault_put(token, "comms/synapse-admin", data) return data - def register_admin(secret: str, username: str, password: str) -> str: - nonce_payload = request_json(f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register") - nonce = nonce_payload.get("nonce") - if not nonce: - raise RuntimeError("synapse register nonce missing") - admin_flag = "admin" - user_type = "" - mac_payload = "\x00".join([nonce, username, password, admin_flag, user_type]) - mac = hmac.new(secret.encode("utf-8"), mac_payload.encode("utf-8"), hashlib.sha1).hexdigest() - payload = { - "nonce": nonce, - "username": username, - "password": password, - "admin": True, - "mac": mac, + def ensure_user(cur, cols, user_id, password, admin): + now_ms = int(time.time() * 1000) + values = { + "name": user_id, + "password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(), + "creation_ts": now_ms, } - req = urllib.request.Request( - f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register", - data=json.dumps(payload).encode("utf-8"), - headers={"Content-Type": "application/json"}, - method="POST", + + def add_flag(name, flag): + if name not in cols: + return + if cols[name]["type"] in ("smallint", "integer"): + values[name] = int(flag) + else: + values[name] = bool(flag) + + add_flag("admin", admin) + add_flag("deactivated", False) + add_flag("shadow_banned", False) + add_flag("is_guest", False) + + columns = list(values.keys()) + placeholders = ", ".join(["%s"] * len(columns)) + updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"]) + query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};" + cur.execute(query, [values[c] for c in columns]) + + def get_cols(cur): + cur.execute( + """ + SELECT column_name, is_nullable, column_default, data_type + FROM information_schema.columns + WHERE table_schema = 'public' AND table_name = 'users' + """ + ) + cols = {} + for name, is_nullable, default, data_type in cur.fetchall(): + cols[name] = { + "nullable": is_nullable == "YES", + "default": default, + "type": data_type, + } + return cols + + def ensure_access_token(cur, user_id, token_value): + cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens") + token_id = cur.fetchone()[0] + cur.execute( + """ + INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms) + VALUES (%s, %s, %s, %s, NULL) + ON CONFLICT (token) DO NOTHING + """, + (token_id, user_id, token_value, "ariadne-admin"), ) - try: - with urllib.request.urlopen(req, timeout=30) as resp: - payload = json.loads(resp.read().decode("utf-8")) - except urllib.error.HTTPError as exc: - body = exc.read().decode("utf-8") - raise RuntimeError(f"synapse admin register failed: {exc.code} {body}") from exc - access_token = payload.get("access_token") - if not access_token: - raise RuntimeError("synapse admin token missing") - return access_token vault_token = vault_login() - reg_secret = ensure_registration_secret(vault_token) admin_data = ensure_admin_creds(vault_token) if admin_data.get("access_token"): log("synapse admin token already present") raise SystemExit(0) - access_token = register_admin(reg_secret, admin_data["username"], admin_data["password"]) - admin_data["access_token"] = access_token + + synapse_db = vault_get(vault_token, "comms/synapse-db") + pg_password = synapse_db.get("POSTGRES_PASSWORD") + if not pg_password: + raise RuntimeError("synapse db password missing") + + user_id = f"@{admin_data['username']}:live.bstein.dev" + conn = psycopg2.connect( + host=PGHOST, + port=PGPORT, + dbname=PGDATABASE, + user=PGUSER, + password=pg_password, + ) + token_value = secrets.token_urlsafe(32) + try: + with conn: + with conn.cursor() as cur: + cols = get_cols(cur) + ensure_user(cur, cols, user_id, admin_data["password"], True) + ensure_access_token(cur, user_id, token_value) + finally: + conn.close() + + admin_data["access_token"] = token_value vault_put(vault_token, "comms/synapse-admin", admin_data) - log("synapse admin user ensured") + log("synapse admin token stored") PY From 47f049d39260aefbf2b9fff73e651618ac997414 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 05:02:02 -0300 Subject: [PATCH 288/416] comms: retain synapse admin ensure logs --- services/comms/synapse-admin-ensure-job.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/synapse-admin-ensure-job.yaml index 6ddea830..5ddf60c4 100644 --- a/services/comms/synapse-admin-ensure-job.yaml +++ b/services/comms/synapse-admin-ensure-job.yaml @@ -2,15 +2,15 @@ apiVersion: batch/v1 kind: Job metadata: - name: synapse-admin-ensure-2 + name: synapse-admin-ensure-3 namespace: comms spec: - backoffLimit: 1 + backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: spec: serviceAccountName: comms-secrets-ensure - restartPolicy: OnFailure + restartPolicy: Never affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -40,7 +40,7 @@ spec: - -c - | set -euo pipefail - pip install --no-cache-dir psycopg2-binary bcrypt >/dev/null + pip install --no-cache-dir psycopg2-binary bcrypt python - <<'PY' import json import os From aed70963cc54514c8b3a9739fbaea86515e0a602 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 08:14:36 +0000 Subject: [PATCH 289/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index c8f9f2c8..1392855b 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-54 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-56 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From ef7946b4f27523f8ae638a7c26c65b18773cf608 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 05:41:58 -0300 Subject: [PATCH 290/416] atlasbot: use cluster snapshot + model update --- services/ai-llm/deployment.yaml | 4 +- services/comms/atlasbot-deployment.yaml | 6 +- services/comms/scripts/atlasbot/bot.py | 368 +++++++++++++++++++++--- 3 files changed, 334 insertions(+), 44 deletions(-) diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index 4f34d866..43d14c81 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -20,7 +20,7 @@ spec: labels: app: ollama annotations: - ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0 + ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0 ai.bstein.dev/gpu: GPU pool (titan-22/24) ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z" spec: @@ -52,7 +52,7 @@ spec: - name: OLLAMA_MODELS value: /root/.ollama - name: OLLAMA_MODEL - value: qwen2.5-coder:7b-instruct-q4_0 + value: qwen2.5:7b-instruct-q4_0 command: - /bin/sh - -c diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index e35fa619..0ee86f01 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -82,11 +82,13 @@ spec: - name: OLLAMA_URL value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL - value: qwen2.5-coder:7b-instruct-q4_0 + value: qwen2.5:7b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC - value: "480" + value: "600" - name: ATLASBOT_THINKING_INTERVAL_SEC value: "120" + - name: ATLASBOT_SNAPSHOT_TTL_SEC + value: "30" - name: ATLASBOT_HTTP_PORT value: "8090" ports: diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 8df1317d..9f6c38dc 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -21,6 +21,7 @@ API_KEY = os.environ.get("CHAT_API_KEY", "") OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090")) ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "") +SNAPSHOT_TTL_SEC = int(os.environ.get("ATLASBOT_SNAPSHOT_TTL_SEC", "30")) KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") @@ -523,7 +524,7 @@ def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool: hw = node.get("hardware") or "" arch = node.get("arch") or "" for f in filters: - if f == "rpi" and hw in ("rpi4", "rpi5"): + if f == "rpi" and hw in ("rpi4", "rpi5", "rpi"): return True if f == "arm64" and arch == "arm64": return True @@ -546,7 +547,7 @@ def _hardware_class(labels: dict[str, Any]) -> str: if str(labels.get("jetson") or "").lower() == "true": return "jetson" hardware = (labels.get("hardware") or "").strip().lower() - if hardware in ("rpi4", "rpi5"): + if hardware in ("rpi4", "rpi5", "rpi"): return hardware arch = labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "" if arch == "amd64": @@ -580,6 +581,14 @@ def node_inventory_live() -> list[dict[str, Any]]: ) return sorted(inventory, key=lambda item: item["name"]) + +def node_inventory() -> list[dict[str, Any]]: + snapshot = _snapshot_state() + inventory = _snapshot_inventory(snapshot) + if inventory: + return inventory + return node_inventory_live() + def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: grouped: dict[str, list[str]] = collections.defaultdict(list) for node in inventory: @@ -591,7 +600,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): return "" if inventory is None: - inventory = node_inventory_live() + inventory = node_inventory() if not inventory: return "" groups = _group_nodes(inventory) @@ -626,7 +635,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: q = normalize_query(prompt) if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")): - return node_inventory_live() + return node_inventory() return [] def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: @@ -656,11 +665,177 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: "expected_missing": sorted(expected_missing), } -def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: + +def _workload_tokens(entry: dict[str, Any]) -> set[str]: + tokens: set[str] = set() + for key in ("workload", "namespace"): + value = entry.get(key) + if isinstance(value, str) and value: + tokens.update(_tokens(value)) + return tokens + + +def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, Any] | None: + q_tokens = set(_tokens(prompt)) + if not q_tokens: + return None + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + tokens = _workload_tokens(entry) + score = len(tokens & q_tokens) + if score: + scored.append((score, entry)) + if not scored: + return None + scored.sort(key=lambda item: item[0], reverse=True) + return scored[0][1] + + +def _format_confidence(answer: str, confidence: str) -> str: + if not answer: + return "" + return f"{answer}\nConfidence: {confidence}." + + +def workload_answer(prompt: str, workloads: list[dict[str, Any]]) -> str: + q = normalize_query(prompt) + if not any(word in q for word in ("where", "which", "node", "run", "running", "host", "located")): + return "" + entry = _select_workload(prompt, workloads) + if not entry: + return "" + workload = entry.get("workload") or "" + namespace = entry.get("namespace") or "" + nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {} + primary = entry.get("primary_node") or "" + if not workload or not nodes: + return "" + parts = [] + if primary: + parts.append(f"{primary} (primary)") + for node, count in sorted(nodes.items(), key=lambda item: (-item[1], item[0])): + if node == primary: + continue + parts.append(f"{node} ({count} pod{'s' if count != 1 else ''})") + node_text = ", ".join(parts) if parts else primary + answer = f"{workload} runs in {namespace}. Nodes: {node_text}." + return _format_confidence(answer, "medium") + + +def _snapshot_metrics(snapshot: dict[str, Any] | None) -> dict[str, Any]: + if not snapshot: + return {} + metrics = snapshot.get("metrics") + return metrics if isinstance(metrics, dict) else {} + + +def _node_usage_top( + usage: list[dict[str, Any]], + *, + allowed_nodes: set[str] | None, +) -> tuple[str, float] | None: + best_node = "" + best_val = None + for item in usage if isinstance(usage, list) else []: + if not isinstance(item, dict): + continue + node = item.get("node") or "" + if allowed_nodes and node not in allowed_nodes: + continue + value = item.get("value") + try: + numeric = float(value) + except (TypeError, ValueError): + continue + if best_val is None or numeric > best_val: + best_val = numeric + best_node = node + if best_node and best_val is not None: + return best_node, best_val + return None + + +def snapshot_metric_answer( + prompt: str, + *, + snapshot: dict[str, Any] | None, + inventory: list[dict[str, Any]], +) -> str: + if not snapshot: + return "" + metrics = _snapshot_metrics(snapshot) + if not metrics: + return "" + q = normalize_query(prompt) + metric = _detect_metric(q) + op = _detect_operation(q) + include_hw, exclude_hw = _detect_hardware_filters(q) + nodes_in_query = _extract_titan_nodes(q) + only_workers = "worker" in q or "workers" in q + + filtered = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + allowed_nodes = {node["name"] for node in filtered} if filtered else None + + if metric in {"cpu", "ram", "net", "io"} and op in {"top", "status", None}: + usage = metrics.get("node_usage", {}).get(metric, []) + top = _node_usage_top(usage, allowed_nodes=allowed_nodes) + if top: + node, val = top + percent = metric in {"cpu", "ram"} + value = _format_metric_value(str(val), percent=percent) + scope = "" + if include_hw: + scope = f" among {' and '.join(sorted(include_hw))}" + answer = f"Hottest node{scope}: {node} ({value})." + return _format_confidence(answer, "high") + + if metric == "connections" or "postgres" in q: + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + used = postgres.get("used") + max_conn = postgres.get("max") + hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + parts: list[str] = [] + if used is not None and max_conn is not None: + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") + if hottest.get("label"): + hot_val = hottest.get("value") + hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" + parts.append(f"Hottest DB: {hottest.get('label')} ({hot_val_str}).") + if parts: + return _format_confidence(" ".join(parts), "high") + + return "" + +def structured_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + metrics_summary: str, + snapshot: dict[str, Any] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> str: q = normalize_query(prompt) if not q: return "" + if workloads: + workload_resp = workload_answer(prompt, workloads) + if workload_resp: + return workload_resp + + snap_resp = snapshot_metric_answer(prompt, snapshot=snapshot, inventory=inventory) + if snap_resp: + return snap_resp + tokens = _tokens(q) op = _detect_operation(q) metric = _detect_metric(q) @@ -749,11 +924,20 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s if op == "status": if "missing" in q and expected_workers: missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) - return "Missing nodes: " + (", ".join(missing) if missing else "none") + "." + return _format_confidence( + "Missing nodes: " + (", ".join(missing) if missing else "none") + ".", + "high", + ) if only_ready is False: - return "Not ready nodes: " + (", ".join(names) if names else "none") + "." + return _format_confidence( + "Not ready nodes: " + (", ".join(names) if names else "none") + ".", + "high", + ) if only_ready is True: - return f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + "." + return _format_confidence( + f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + ".", + "high", + ) if op == "count": if expected_workers and ("expected" in q or "should" in q): @@ -761,10 +945,10 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." if missing: msg += f" Missing: {', '.join(missing)}." - return msg + return _format_confidence(msg, "high") if not (include_hw or exclude_hw or nodes_in_query or only_workers): - return f"Atlas has {len(names)} nodes." - return f"Matching nodes: {len(names)}." + return _format_confidence(f"Atlas has {len(names)} nodes.", "high") + return _format_confidence(f"Matching nodes: {len(names)}.", "high") if op == "list": if nodes_in_query: @@ -772,12 +956,12 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s existing = {n["name"] for n in inventory} for node in nodes_in_query: parts.append(f"{node}: {'present' if node in existing else 'not present'}") - return "Node presence: " + ", ".join(parts) + "." + return _format_confidence("Node presence: " + ", ".join(parts) + ".", "high") if not names: - return "Matching nodes: none." + return _format_confidence("Matching nodes: none.", "high") shown = names[:30] suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else "" - return "Matching nodes: " + ", ".join(shown) + suffix + "." + return _format_confidence("Matching nodes: " + ", ".join(shown) + suffix + ".", "high") return "" @@ -922,6 +1106,58 @@ def _ariadne_state(timeout: int = 5) -> dict | None: except Exception: return None + +_SNAPSHOT_CACHE: dict[str, Any] = {"payload": None, "ts": 0.0} + + +def _snapshot_state() -> dict[str, Any] | None: + now = time.monotonic() + cached = _SNAPSHOT_CACHE.get("payload") + ts = _SNAPSHOT_CACHE.get("ts") or 0.0 + if cached and now - ts < max(5, SNAPSHOT_TTL_SEC): + return cached + payload = _ariadne_state(timeout=10) + if isinstance(payload, dict) and payload: + _SNAPSHOT_CACHE["payload"] = payload + _SNAPSHOT_CACHE["ts"] = now + return payload + return cached if isinstance(cached, dict) else None + + +def _snapshot_inventory(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: + if not snapshot: + return [] + items = snapshot.get("nodes_detail") + if not isinstance(items, list): + return [] + inventory: list[dict[str, Any]] = [] + for node in items: + if not isinstance(node, dict): + continue + labels = node.get("labels") if isinstance(node.get("labels"), dict) else {} + name = node.get("name") or "" + if not name: + continue + hardware = node.get("hardware") or _hardware_class(labels) + inventory.append( + { + "name": name, + "arch": node.get("arch") or labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", + "hardware": hardware, + "roles": node.get("roles") or [], + "is_worker": node.get("is_worker") is True, + "ready": node.get("ready") is True, + } + ) + return sorted(inventory, key=lambda item: item["name"]) + + +def _snapshot_workloads(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: + if not snapshot: + return [] + workloads = snapshot.get("workloads") + return workloads if isinstance(workloads, list) else [] + def k8s_pods(namespace: str) -> list[dict]: data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500") items = data.get("items") or [] @@ -1079,25 +1315,11 @@ def _node_is_worker(node: dict) -> bool: return True return True -def worker_nodes_status() -> tuple[list[str], list[str]]: - try: - data = k8s_get("/api/v1/nodes?limit=500") - except Exception: - return ([], []) - items = data.get("items") or [] - ready_nodes: list[str] = [] - not_ready_nodes: list[str] = [] - for node in items if isinstance(items, list) else []: - if not _node_is_worker(node): - continue - name = (node.get("metadata") or {}).get("name") or "" - if not name: - continue - ready = _node_ready_status(node) - if ready is True: - ready_nodes.append(name) - elif ready is False: - not_ready_nodes.append(name) +def worker_nodes_status(inventory: list[dict[str, Any]] | None = None) -> tuple[list[str], list[str]]: + if inventory is None: + inventory = node_inventory() + ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is True] + not_ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is False] return (sorted(ready_nodes), sorted(not_ready_nodes)) def expected_worker_nodes_from_metrics() -> list[str]: @@ -1238,13 +1460,29 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): if not prompt: self._write_json(400, {"error": "missing_prompt"}) return - inventory = node_inventory_live() - answer = structured_answer(prompt, inventory=inventory, metrics_summary="") + snapshot = _snapshot_state() + inventory = _snapshot_inventory(snapshot) or node_inventory_live() + workloads = _snapshot_workloads(snapshot) + answer = structured_answer( + prompt, + inventory=inventory, + metrics_summary="", + snapshot=snapshot, + workloads=workloads, + ) if not answer and _knowledge_intent(prompt): answer = knowledge_summary(prompt, inventory) if not answer: kb = kb_retrieve_titles(prompt, limit=4) - answer = kb or "" + context = build_context( + prompt, + allow_tools=False, + targets=[], + inventory=inventory, + snapshot=snapshot, + ) + fallback = kb or "I don't have enough data to answer that." + answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) self._write_json(200, {"answer": answer}) @@ -1266,6 +1504,7 @@ def build_context( allow_tools: bool, targets: list[tuple[str, str]], inventory: list[dict[str, Any]] | None = None, + snapshot: dict[str, Any] | None = None, ) -> str: parts: list[str] = [] @@ -1281,6 +1520,10 @@ def build_context( if node_ctx: parts.append(node_ctx) + snapshot_ctx = snapshot_context(prompt, snapshot) + if snapshot_ctx: + parts.append(snapshot_ctx) + if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set) @@ -1311,6 +1554,33 @@ def build_context( return "\n\n".join([p for p in parts if p]).strip() +def snapshot_context(prompt: str, snapshot: dict[str, Any] | None) -> str: + if not snapshot: + return "" + metrics = _snapshot_metrics(snapshot) + workloads = _snapshot_workloads(snapshot) + q = normalize_query(prompt) + parts: list[str] = [] + nodes = snapshot.get("nodes") if isinstance(snapshot.get("nodes"), dict) else {} + if nodes.get("total") is not None: + parts.append( + f"Snapshot: nodes_total={nodes.get('total')}, ready={nodes.get('ready')}, not_ready={nodes.get('not_ready')}." + ) + if any(word in q for word in ("postgres", "connections", "db")): + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if postgres: + parts.append(f"Snapshot: postgres_connections={postgres}.") + if any(word in q for word in ("hottest", "cpu", "ram", "memory", "net", "network", "io", "disk")): + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if hottest: + parts.append(f"Snapshot: hottest_nodes={hottest}.") + if workloads and any(word in q for word in ("run", "running", "host", "node", "where", "which")): + match = _select_workload(prompt, workloads) + if match: + parts.append(f"Snapshot: workload={match}.") + return "\n".join(parts).strip() + + def _knowledge_intent(prompt: str) -> bool: q = normalize_query(prompt) return any( @@ -1350,7 +1620,8 @@ def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: kb_titles = kb_retrieve_titles(prompt, limit=4) if kb_titles: parts.append(kb_titles) - return "\n".join(parts).strip() + summary = "\n".join(parts).strip() + return _format_confidence(summary, "medium") if summary else "" def _ollama_call(hist_key, prompt: str, *, context: str) -> str: system = ( @@ -1360,7 +1631,8 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "Never include or request secret values. " "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " - "If the answer is not grounded in the provided context or tool data, say you do not know." + "If the answer is not grounded in the provided context or tool data, say you do not know. " + "End every response with a line: 'Confidence: high|medium|low'." ) transcript_parts = [system] if context: @@ -1491,8 +1763,18 @@ def sync_loop(token: str, room_id: str): if isinstance(w, dict) and w.get("name"): targets.append((ns, str(w["name"]))) + snapshot = _snapshot_state() inventory = node_inventory_for_prompt(body) - context = build_context(body, allow_tools=allow_tools, targets=targets, inventory=inventory) + if not inventory: + inventory = _snapshot_inventory(snapshot) + workloads = _snapshot_workloads(snapshot) + context = build_context( + body, + allow_tools=allow_tools, + targets=targets, + inventory=inventory, + snapshot=snapshot, + ) if allow_tools and promql: res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" @@ -1506,7 +1788,13 @@ def sync_loop(token: str, room_id: str): if not fallback and context: fallback = _context_fallback(context) - structured = structured_answer(body, inventory=inventory, metrics_summary=metrics_fallback or "") + structured = structured_answer( + body, + inventory=inventory, + metrics_summary=metrics_fallback or "", + snapshot=snapshot, + workloads=workloads, + ) if structured: send_msg(token, rid, structured) continue From 13f9fd425821a5c2416ec2118f531d12eab3408a Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 08:50:29 +0000 Subject: [PATCH 291/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 1392855b..0f8cd2a0 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-56 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-57 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From b73d4d6533ac36e49e97a7090c505517d524d52b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 09:00:40 +0000 Subject: [PATCH 292/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 0f8cd2a0..e4580aae 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-57 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-58 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 91d4da9397cdaf283c3a26776ec95fe3b8eff65d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 06:28:03 -0300 Subject: [PATCH 293/416] atlasbot: shift to facts context and upgrade model --- services/ai-llm/deployment.yaml | 4 +- services/comms/atlasbot-deployment.yaml | 4 +- services/comms/scripts/atlasbot/bot.py | 203 +++++++++++++++++------- 3 files changed, 151 insertions(+), 60 deletions(-) diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index 43d14c81..bf012c0b 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -20,7 +20,7 @@ spec: labels: app: ollama annotations: - ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0 + ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0 ai.bstein.dev/gpu: GPU pool (titan-22/24) ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z" spec: @@ -52,7 +52,7 @@ spec: - name: OLLAMA_MODELS value: /root/.ollama - name: OLLAMA_MODEL - value: qwen2.5:7b-instruct-q4_0 + value: qwen2.5:14b-instruct-q4_0 command: - /bin/sh - -c diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 0ee86f01..f4883c41 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-29 + checksum/atlasbot-configmap: manual-atlasbot-30 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -82,7 +82,7 @@ spec: - name: OLLAMA_URL value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL - value: qwen2.5:7b-instruct-q4_0 + value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC value: "600" - name: ATLASBOT_THINKING_INTERVAL_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9f6c38dc..a91744dd 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -33,7 +33,10 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) +MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000")) THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) +OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2")) +OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false" TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE) HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)") @@ -113,6 +116,8 @@ METRIC_HINTS = { "connections": ("connections", "conn", "postgres", "database", "db"), } +_OLLAMA_LOCK = threading.Lock() + HARDWARE_HINTS = { "amd64": ("amd64", "x86", "x86_64", "x86-64"), "jetson": ("jetson",), @@ -638,6 +643,105 @@ def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: return node_inventory() return [] +def _nodes_by_arch(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: + grouped: dict[str, list[str]] = collections.defaultdict(list) + for node in inventory: + grouped[(node.get("arch") or "unknown")].append(node["name"]) + return {k: sorted(v) for k, v in grouped.items()} + +def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]: + usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} + per_node: dict[str, dict[str, Any]] = {} + for metric_name, entries in usage.items() if isinstance(usage, dict) else []: + if not isinstance(entries, list): + continue + for entry in entries: + if not isinstance(entry, dict): + continue + node = entry.get("node") + if not isinstance(node, str) or not node: + continue + per_node.setdefault(node, {})[metric_name] = entry.get("value") + return [{"node": node, **vals} for node, vals in sorted(per_node.items())] + +def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]: + cleaned: list[dict[str, Any]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + cleaned.append( + { + "namespace": entry.get("namespace"), + "workload": entry.get("workload"), + "pods_total": entry.get("pods_total"), + "pods_running": entry.get("pods_running"), + "primary_node": entry.get("primary_node"), + "nodes": entry.get("nodes"), + } + ) + cleaned.sort( + key=lambda item: ( + -(item.get("pods_total") or 0), + str(item.get("namespace") or ""), + str(item.get("workload") or ""), + ) + ) + return cleaned[:limit] + +def facts_context( + prompt: str, + *, + inventory: list[dict[str, Any]] | None, + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + inv = inventory or [] + metrics = _snapshot_metrics(snapshot) + nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {} + summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} + expected_workers = expected_worker_nodes_from_metrics() + ready_workers, not_ready_workers = worker_nodes_status(inv) if inv else ([], []) + + facts: dict[str, Any] = { + "generated_at": snapshot.get("generated_at") if isinstance(snapshot, dict) else None, + "nodes": { + "total": summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total"), + "ready": summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready"), + "not_ready": summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready"), + "not_ready_names": summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names"), + "by_hardware": _group_nodes(inv) if inv else {}, + "by_arch": _nodes_by_arch(inv) if inv else {}, + "workers_ready": ready_workers, + "workers_not_ready": not_ready_workers, + "expected_workers": expected_workers, + }, + "metrics": { + "hottest_nodes": metrics.get("hottest_nodes") if isinstance(metrics, dict) else {}, + "postgres_connections": metrics.get("postgres_connections") if isinstance(metrics, dict) else {}, + "node_usage": _node_usage_table(metrics) if isinstance(metrics, dict) else [], + }, + "workloads": _workloads_for_facts(workloads or []), + } + + rendered = json.dumps(facts, ensure_ascii=False) + if len(rendered) <= MAX_FACTS_CHARS: + return "Facts (live snapshot):\n" + rendered + + trimmed = dict(facts) + trimmed.pop("workloads", None) + rendered = json.dumps(trimmed, ensure_ascii=False) + if len(rendered) <= MAX_FACTS_CHARS: + return "Facts (live snapshot):\n" + rendered + + trimmed_metrics = dict(trimmed.get("metrics") or {}) + trimmed_metrics.pop("node_usage", None) + trimmed["metrics"] = trimmed_metrics + rendered = json.dumps(trimmed, ensure_ascii=False) + if len(rendered) <= MAX_FACTS_CHARS: + return "Facts (live snapshot):\n" + rendered + + return "Facts (live snapshot):\n" + rendered[:MAX_FACTS_CHARS] + def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: names = [node["name"] for node in inventory] ready = [node["name"] for node in inventory if node.get("ready") is True] @@ -1463,26 +1567,19 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) - answer = structured_answer( + context = build_context( prompt, + allow_tools=False, + targets=[], inventory=inventory, - metrics_summary="", snapshot=snapshot, workloads=workloads, ) - if not answer and _knowledge_intent(prompt): - answer = knowledge_summary(prompt, inventory) - if not answer: - kb = kb_retrieve_titles(prompt, limit=4) - context = build_context( - prompt, - allow_tools=False, - targets=[], - inventory=inventory, - snapshot=snapshot, - ) - fallback = kb or "I don't have enough data to answer that." - answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) + metrics_context, metrics_fallback = metrics_query_context(prompt, allow_tools=True) + if metrics_context: + context = (context + "\n\n" + metrics_context).strip() if context else metrics_context + fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that." + answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) self._write_json(200, {"answer": answer}) @@ -1505,10 +1602,13 @@ def build_context( targets: list[tuple[str, str]], inventory: list[dict[str, Any]] | None = None, snapshot: dict[str, Any] | None = None, + workloads: list[dict[str, Any]] | None = None, ) -> str: parts: list[str] = [] kb = kb_retrieve(prompt) + if not kb and _knowledge_intent(prompt): + kb = kb_retrieve_titles(prompt, limit=4) if kb: parts.append(kb) @@ -1516,13 +1616,9 @@ def build_context( if endpoints: parts.append(endpoints) - node_ctx = node_inventory_context(prompt, inventory) - if node_ctx: - parts.append(node_ctx) - - snapshot_ctx = snapshot_context(prompt, snapshot) - if snapshot_ctx: - parts.append(snapshot_ctx) + facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if facts: + parts.append(facts) if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. @@ -1627,7 +1723,9 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: system = ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " - "Prefer answering with exact repo paths and Kubernetes resource names. " + "Use the provided context and facts as your source of truth. " + "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " + "Prefer exact repo paths and Kubernetes resource names when relevant. " "Never include or request secret values. " "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " @@ -1646,21 +1744,32 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: if API_KEY: headers["x-api-key"] = API_KEY r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) - with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: - data = json.loads(resp.read().decode()) - raw_reply = data.get("message") or data.get("response") or data.get("reply") or data - reply = _normalize_reply(raw_reply) or "I'm here to help." - history[hist_key].append(f"Atlas: {reply}") - return reply + lock = _OLLAMA_LOCK if OLLAMA_SERIALIZE else None + if lock: + lock.acquire() + try: + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + raw_reply = data.get("message") or data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." + history[hist_key].append(f"Atlas: {reply}") + return reply + finally: + if lock: + lock.release() def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str: - try: - return _ollama_call(hist_key, prompt, context=context) - except Exception: - if fallback: - history[hist_key].append(f"Atlas: {fallback}") - return fallback - return "Model backend is busy. Try again in a moment." + last_error = None + for attempt in range(max(1, OLLAMA_RETRIES + 1)): + try: + return _ollama_call(hist_key, prompt, context=context) + except Exception as exc: # noqa: BLE001 + last_error = exc + time.sleep(min(4, 2 ** attempt)) + if fallback: + history[hist_key].append(f"Atlas: {fallback}") + return fallback + return "I don't have enough data to answer that." def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str: result: dict[str, str] = {"reply": ""} @@ -1774,6 +1883,7 @@ def sync_loop(token: str, room_id: str): targets=targets, inventory=inventory, snapshot=snapshot, + workloads=workloads, ) if allow_tools and promql: res = vm_query(promql, timeout=20) @@ -1784,26 +1894,7 @@ def sync_loop(token: str, room_id: str): if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = metrics_fallback or "" - if not fallback and context: - fallback = _context_fallback(context) - - structured = structured_answer( - body, - inventory=inventory, - metrics_summary=metrics_fallback or "", - snapshot=snapshot, - workloads=workloads, - ) - if structured: - send_msg(token, rid, structured) - continue - - if _knowledge_intent(body): - summary = knowledge_summary(body, inventory) - if summary: - send_msg(token, rid, summary) - continue + fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that." reply = ollama_reply_with_thinking( token, From 70b313ce1e8e57ef1b25e35aef4421bd142b76e6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 06:34:37 -0300 Subject: [PATCH 294/416] atlasbot: enrich facts summary for LLM --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 46 +++++++++++++++++++++---- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index f4883c41..377a076e 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-30 + checksum/atlasbot-configmap: manual-atlasbot-31 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index a91744dd..3f055292 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -723,24 +723,55 @@ def facts_context( "workloads": _workloads_for_facts(workloads or []), } + summary_lines: list[str] = [] + nodes_info = facts.get("nodes") if isinstance(facts.get("nodes"), dict) else {} + if nodes_info.get("total") is not None: + summary_lines.append( + f"nodes_total={nodes_info.get('total')}, ready={nodes_info.get('ready')}, not_ready={nodes_info.get('not_ready')}" + ) + hottest = facts.get("metrics", {}).get("hottest_nodes") if isinstance(facts.get("metrics"), dict) else {} + if isinstance(hottest, dict) and hottest: + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if node and value is not None: + summary_lines.append(f"hottest_{key}={node} ({value})") + postgres = facts.get("metrics", {}).get("postgres_connections") if isinstance(facts.get("metrics"), dict) else {} + if isinstance(postgres, dict) and postgres: + used = postgres.get("used") + max_conn = postgres.get("max") + if used is not None and max_conn is not None: + summary_lines.append(f"postgres_used={used}, postgres_max={max_conn}") + hottest_db = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + if hottest_db.get("label"): + summary_lines.append(f"postgres_hottest_db={hottest_db.get('label')} ({hottest_db.get('value')})") + rendered = json.dumps(facts, ensure_ascii=False) - if len(rendered) <= MAX_FACTS_CHARS: - return "Facts (live snapshot):\n" + rendered + rendered_parts = [] + if summary_lines: + rendered_parts.append("Facts summary:\n" + "\n".join(f"- {line}" for line in summary_lines)) + rendered_parts.append("Facts (live snapshot JSON):\n" + rendered) + combined = "\n\n".join(rendered_parts) + if len(combined) <= MAX_FACTS_CHARS: + return combined trimmed = dict(facts) trimmed.pop("workloads", None) rendered = json.dumps(trimmed, ensure_ascii=False) - if len(rendered) <= MAX_FACTS_CHARS: - return "Facts (live snapshot):\n" + rendered + combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered]) + if len(combined) <= MAX_FACTS_CHARS: + return combined trimmed_metrics = dict(trimmed.get("metrics") or {}) trimmed_metrics.pop("node_usage", None) trimmed["metrics"] = trimmed_metrics rendered = json.dumps(trimmed, ensure_ascii=False) - if len(rendered) <= MAX_FACTS_CHARS: - return "Facts (live snapshot):\n" + rendered + combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered]) + if len(combined) <= MAX_FACTS_CHARS: + return combined - return "Facts (live snapshot):\n" + rendered[:MAX_FACTS_CHARS] + return combined[:MAX_FACTS_CHARS] def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: names = [node["name"] for node in inventory] @@ -1724,6 +1755,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " "Use the provided context and facts as your source of truth. " + "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " "Prefer exact repo paths and Kubernetes resource names when relevant. " "Never include or request secret values. " From a8ea436fcff2c3087c5f4a7776597a1db602b5e2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 06:45:18 -0300 Subject: [PATCH 295/416] atlasbot: shrink facts context to avoid truncation --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 148 ++++++++++++++---------- 2 files changed, 89 insertions(+), 61 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 377a076e..7cb2d7da 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-31 + checksum/atlasbot-configmap: manual-atlasbot-32 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 3f055292..9e8e0ddd 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -688,6 +688,20 @@ def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> li ) return cleaned[:limit] +def _workloads_for_prompt(prompt: str, workloads: list[dict[str, Any]], limit: int = 12) -> list[dict[str, Any]]: + tokens = set(_tokens(prompt)) + if tokens: + matched: list[dict[str, Any]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + entry_tokens = _workload_tokens(entry) + if entry_tokens & tokens: + matched.append(entry) + if matched: + return _workloads_for_facts(matched, limit=limit) + return _workloads_for_facts(workloads, limit=limit) + def facts_context( prompt: str, *, @@ -701,77 +715,91 @@ def facts_context( summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} expected_workers = expected_worker_nodes_from_metrics() ready_workers, not_ready_workers = worker_nodes_status(inv) if inv else ([], []) + total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total") + ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready") + not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready") + not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names") + by_hardware = _group_nodes(inv) if inv else {} + by_arch = _nodes_by_arch(inv) if inv else {} - facts: dict[str, Any] = { - "generated_at": snapshot.get("generated_at") if isinstance(snapshot, dict) else None, - "nodes": { - "total": summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total"), - "ready": summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready"), - "not_ready": summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready"), - "not_ready_names": summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names"), - "by_hardware": _group_nodes(inv) if inv else {}, - "by_arch": _nodes_by_arch(inv) if inv else {}, - "workers_ready": ready_workers, - "workers_not_ready": not_ready_workers, - "expected_workers": expected_workers, - }, - "metrics": { - "hottest_nodes": metrics.get("hottest_nodes") if isinstance(metrics, dict) else {}, - "postgres_connections": metrics.get("postgres_connections") if isinstance(metrics, dict) else {}, - "node_usage": _node_usage_table(metrics) if isinstance(metrics, dict) else [], - }, - "workloads": _workloads_for_facts(workloads or []), - } - - summary_lines: list[str] = [] - nodes_info = facts.get("nodes") if isinstance(facts.get("nodes"), dict) else {} - if nodes_info.get("total") is not None: - summary_lines.append( - f"nodes_total={nodes_info.get('total')}, ready={nodes_info.get('ready')}, not_ready={nodes_info.get('not_ready')}" + lines: list[str] = ["Facts (live snapshot):"] + if total is not None: + lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}") + if not_ready_names: + lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}") + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes_list = by_hardware.get(key) or [] + if nodes_list: + lines.append(f"- {key}: {', '.join(nodes_list)}") + for key, nodes_list in sorted(by_arch.items()): + if nodes_list: + lines.append(f"- arch {key}: {', '.join(nodes_list)}") + if ready_workers or not_ready_workers: + lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}") + if not_ready_workers: + lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") + if expected_workers: + missing = sorted( + set(expected_workers) + - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")} ) - hottest = facts.get("metrics", {}).get("hottest_nodes") if isinstance(facts.get("metrics"), dict) else {} - if isinstance(hottest, dict) and hottest: - for key in ("cpu", "ram", "net", "io"): - entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} - node = entry.get("node") - value = entry.get("value") - if node and value is not None: - summary_lines.append(f"hottest_{key}={node} ({value})") - postgres = facts.get("metrics", {}).get("postgres_connections") if isinstance(facts.get("metrics"), dict) else {} + lines.append(f"- expected_workers: {', '.join(expected_workers)}") + if missing: + lines.append(f"- expected_workers_missing: {', '.join(missing)}") + + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if node and value is not None: + lines.append(f"- hottest_{key}: {node} ({value})") + + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} if isinstance(postgres, dict) and postgres: used = postgres.get("used") max_conn = postgres.get("max") if used is not None and max_conn is not None: - summary_lines.append(f"postgres_used={used}, postgres_max={max_conn}") + lines.append(f"- postgres_connections: {used} used / {max_conn} max") hottest_db = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} if hottest_db.get("label"): - summary_lines.append(f"postgres_hottest_db={hottest_db.get('label')} ({hottest_db.get('value')})") + lines.append( + f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})" + ) - rendered = json.dumps(facts, ensure_ascii=False) - rendered_parts = [] - if summary_lines: - rendered_parts.append("Facts summary:\n" + "\n".join(f"- {line}" for line in summary_lines)) - rendered_parts.append("Facts (live snapshot JSON):\n" + rendered) - combined = "\n\n".join(rendered_parts) - if len(combined) <= MAX_FACTS_CHARS: - return combined + usage_table = _node_usage_table(metrics) + if usage_table: + lines.append("- node_usage (cpu/ram/net/io):") + for entry in usage_table: + node = entry.get("node") + if not node: + continue + cpu = entry.get("cpu") + ram = entry.get("ram") + net = entry.get("net") + io_val = entry.get("io") + lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") - trimmed = dict(facts) - trimmed.pop("workloads", None) - rendered = json.dumps(trimmed, ensure_ascii=False) - combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered]) - if len(combined) <= MAX_FACTS_CHARS: - return combined + workload_entries = _workloads_for_prompt(prompt, workloads or []) + if workload_entries: + lines.append("- workloads:") + for entry in workload_entries: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + wl = entry.get("workload") or "" + primary = entry.get("primary_node") or "" + pods_total = entry.get("pods_total") + label = f"{ns}/{wl}" if ns and wl else (wl or ns) + if not label: + continue + if primary: + lines.append(f" - {label}: primary_node={primary}, pods_total={pods_total}") + else: + lines.append(f" - {label}: pods_total={pods_total}") - trimmed_metrics = dict(trimmed.get("metrics") or {}) - trimmed_metrics.pop("node_usage", None) - trimmed["metrics"] = trimmed_metrics - rendered = json.dumps(trimmed, ensure_ascii=False) - combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered]) - if len(combined) <= MAX_FACTS_CHARS: - return combined - - return combined[:MAX_FACTS_CHARS] + rendered = "\n".join(lines) + return rendered[:MAX_FACTS_CHARS] def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: names = [node["name"] for node in inventory] From 8f05dc9b0261a679c617ab2c3d99490d376dd85b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 11:03:55 -0300 Subject: [PATCH 296/416] atlasbot: strengthen facts context and replies --- services/comms/scripts/atlasbot/bot.py | 91 +++++++++++++++++++------- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9e8e0ddd..e0056f8a 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -34,6 +34,7 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000")) +MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000")) THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2")) OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false" @@ -100,6 +101,7 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" +CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECASE) OPERATION_HINTS = { "count": ("how many", "count", "number", "total"), @@ -139,6 +141,20 @@ def _tokens(text: str) -> list[str]: return [t for t in toks if t not in STOPWORDS and len(t) >= 2] +def _ensure_confidence(text: str) -> str: + if not text: + return "" + lines = text.strip().splitlines() + for idx, line in enumerate(lines): + match = CONFIDENCE_RE.search(line) + if match: + level = match.group(1).lower() + lines[idx] = f"Confidence: {level}" + return "\n".join(lines) + lines.append("Confidence: medium") + return "\n".join(lines) + + # Mention detection (Matrix rich mentions + plain @atlas). MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()] MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS] @@ -710,6 +726,7 @@ def facts_context( workloads: list[dict[str, Any]] | None, ) -> str: inv = inventory or [] + nodes_in_query = _extract_titan_nodes(prompt) metrics = _snapshot_metrics(snapshot) nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {} summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} @@ -721,6 +738,12 @@ def facts_context( not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names") by_hardware = _group_nodes(inv) if inv else {} by_arch = _nodes_by_arch(inv) if inv else {} + control_plane_nodes = [ + node["name"] + for node in inv + if any(role in ("control-plane", "master") for role in (node.get("roles") or [])) + ] + worker_nodes = [node["name"] for node in inv if node.get("is_worker") is True] lines: list[str] = ["Facts (live snapshot):"] if total is not None: @@ -731,9 +754,16 @@ def facts_context( nodes_list = by_hardware.get(key) or [] if nodes_list: lines.append(f"- {key}: {', '.join(nodes_list)}") + non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", []))) + if non_rpi: + lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}") for key, nodes_list in sorted(by_arch.items()): if nodes_list: lines.append(f"- arch {key}: {', '.join(nodes_list)}") + if control_plane_nodes: + lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}") + if worker_nodes: + lines.append(f"- worker_nodes: {', '.join(worker_nodes)}") if ready_workers or not_ready_workers: lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}") if not_ready_workers: @@ -753,7 +783,8 @@ def facts_context( node = entry.get("node") value = entry.get("value") if node and value is not None: - lines.append(f"- hottest_{key}: {node} ({value})") + value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram")) + lines.append(f"- hottest_{key}: {node} ({value_fmt})") postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} if isinstance(postgres, dict) and postgres: @@ -774,12 +805,25 @@ def facts_context( node = entry.get("node") if not node: continue - cpu = entry.get("cpu") - ram = entry.get("ram") - net = entry.get("net") - io_val = entry.get("io") + cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else "" + ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else "" + net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else "" + io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else "" lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") + if nodes_in_query: + lines.append("- node_details:") + for name in nodes_in_query: + detail = next((n for n in inv if n.get("name") == name), None) + if not detail: + lines.append(f" - {name}: not found in snapshot") + continue + roles = ",".join(detail.get("roles") or []) or "none" + lines.append( + f" - {name}: hardware={detail.get('hardware')}, arch={detail.get('arch')}, " + f"ready={detail.get('ready')}, roles={roles}" + ) + workload_entries = _workloads_for_prompt(prompt, workloads or []) if workload_entries: lines.append("- workloads:") @@ -1181,11 +1225,10 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: if rendered: rendered_parts.append(rendered) if not rendered_parts: - return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data." + return "", "" summary = "\n".join(rendered_parts) context = f"Metrics (from {dashboard} / {panel}):\n{summary}" - fallback = _metrics_fallback_summary(panel, summary) - return context, fallback + return context, "" def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]: q = (query or "").strip() @@ -1574,8 +1617,8 @@ def _normalize_reply(value: Any) -> str: try: return _normalize_reply(json.loads(text)) except Exception: - return text - return text + return _ensure_confidence(text) + return _ensure_confidence(text) # Internal HTTP endpoint for cluster answers (website uses this). @@ -1634,10 +1677,10 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot=snapshot, workloads=workloads, ) - metrics_context, metrics_fallback = metrics_query_context(prompt, allow_tools=True) + metrics_context, _metrics_fallback = metrics_query_context(prompt, allow_tools=True) if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that." + fallback = "I don't have enough data to answer that." answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) self._write_json(200, {"answer": answer}) @@ -1665,19 +1708,19 @@ def build_context( ) -> str: parts: list[str] = [] - kb = kb_retrieve(prompt) - if not kb and _knowledge_intent(prompt): - kb = kb_retrieve_titles(prompt, limit=4) - if kb: - parts.append(kb) + facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if facts: + parts.append(facts) endpoints, edges = catalog_hints(prompt) if endpoints: parts.append(endpoints) - facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) - if facts: - parts.append(facts) + kb = kb_retrieve(prompt) + if not kb and _knowledge_intent(prompt): + kb = kb_retrieve_titles(prompt, limit=4) + if kb: + parts.append(kb) if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. @@ -1789,12 +1832,14 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "Never include or request secret values. " "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " + "Translate metrics into natural language instead of echoing raw label/value pairs. " + "Do not answer by only listing runbooks; summarize the cluster first and mention docs only if useful. " "If the answer is not grounded in the provided context or tool data, say you do not know. " "End every response with a line: 'Confidence: high|medium|low'." ) transcript_parts = [system] if context: - transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS]) + transcript_parts.append("Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]) transcript_parts.extend(history[hist_key][-24:]) transcript_parts.append(f"User: {prompt}") transcript = "\n".join(transcript_parts) @@ -1950,11 +1995,11 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics) + metrics_context, _metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics) if metrics_context: context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that." + fallback = "I don't have enough data to answer that." reply = ollama_reply_with_thinking( token, From 677230ebeb5fc1d1c6f58c755cccb60f40a48803 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 11:05:30 -0300 Subject: [PATCH 297/416] comms: bump atlasbot configmap checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7cb2d7da..93b5108f 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-32 + checksum/atlasbot-configmap: manual-atlasbot-33 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 3efbe161ac98c62bd8a5d7c6c382e47044c8d64d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:20:50 -0300 Subject: [PATCH 298/416] comms: point atlasbot to ollama and raise gateway memory --- services/bstein-dev-home/chat-ai-gateway-deployment.yaml | 4 ++-- services/comms/atlasbot-deployment.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 7209da62..e5724067 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -67,10 +67,10 @@ spec: resources: requests: cpu: 20m - memory: 64Mi + memory: 128Mi limits: cpu: 200m - memory: 256Mi + memory: 512Mi volumeMounts: - name: code mountPath: /app/gateway.py diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 93b5108f..d41f97cf 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -80,7 +80,7 @@ spec: - name: BOT_MENTIONS value: atlasbot,aatlasbot - name: OLLAMA_URL - value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ + value: http://ollama.ai.svc.cluster.local:11434/ - name: OLLAMA_MODEL value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC From 9e1b2997ce94a2461018a24f731136c4f27e190c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:23:05 -0300 Subject: [PATCH 299/416] comms: restore atlasbot gateway URL --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d41f97cf..93b5108f 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -80,7 +80,7 @@ spec: - name: BOT_MENTIONS value: atlasbot,aatlasbot - name: OLLAMA_URL - value: http://ollama.ai.svc.cluster.local:11434/ + value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC From 612f71c5c4e75fac897e67cfd17c9e26533e510d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:33:56 -0300 Subject: [PATCH 300/416] atlasbot: call ollama chat directly --- services/comms/atlasbot-deployment.yaml | 4 +- services/comms/scripts/atlasbot/bot.py | 55 +++++++++++++++++++++---- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 93b5108f..7ec373fd 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-33 + checksum/atlasbot-configmap: manual-atlasbot-34 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -80,7 +80,7 @@ spec: - name: BOT_MENTIONS value: atlasbot,aatlasbot - name: OLLAMA_URL - value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ + value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e0056f8a..6644afb7 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -155,6 +155,37 @@ def _ensure_confidence(text: str) -> str: return "\n".join(lines) +def _ollama_endpoint() -> str: + url = (OLLAMA_URL or "").strip() + if not url: + return "" + if url.endswith("/api/chat"): + return url + return url.rstrip("/") + "/api/chat" + + +def _history_to_messages(lines: list[str]) -> list[dict[str, str]]: + messages: list[dict[str, str]] = [] + for line in lines: + raw = (line or "").strip() + if not raw: + continue + role = "user" + content = raw + lowered = raw.lower() + if lowered.startswith("atlas:"): + role = "assistant" + content = raw.split(":", 1)[1].strip() + elif lowered.startswith("user:"): + role = "user" + content = raw.split(":", 1)[1].strip() + elif ":" in raw: + content = raw.split(":", 1)[1].strip() + if content: + messages.append({"role": role, "content": content}) + return messages + + # Mention detection (Matrix rich mentions + plain @atlas). MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()] MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS] @@ -1837,25 +1868,33 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "If the answer is not grounded in the provided context or tool data, say you do not know. " "End every response with a line: 'Confidence: high|medium|low'." ) - transcript_parts = [system] + endpoint = _ollama_endpoint() + if not endpoint: + raise RuntimeError("ollama endpoint missing") + system_content = system if context: - transcript_parts.append("Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]) - transcript_parts.extend(history[hist_key][-24:]) - transcript_parts.append(f"User: {prompt}") - transcript = "\n".join(transcript_parts) + system_content += "\n\nContext (grounded):\n" + context[:MAX_CONTEXT_CHARS] - payload = {"model": MODEL, "message": transcript} + messages: list[dict[str, str]] = [{"role": "system", "content": system_content}] + messages.extend(_history_to_messages(history[hist_key][-24:])) + messages.append({"role": "user", "content": prompt}) + + payload = {"model": MODEL, "messages": messages, "stream": False} headers = {"Content-Type": "application/json"} if API_KEY: headers["x-api-key"] = API_KEY - r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) + r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) lock = _OLLAMA_LOCK if OLLAMA_SERIALIZE else None if lock: lock.acquire() try: with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: data = json.loads(resp.read().decode()) - raw_reply = data.get("message") or data.get("response") or data.get("reply") or data + msg = data.get("message") if isinstance(data, dict) else None + if isinstance(msg, dict): + raw_reply = msg.get("content") + else: + raw_reply = data.get("response") or data.get("reply") or data reply = _normalize_reply(raw_reply) or "I'm here to help." history[hist_key].append(f"Atlas: {reply}") return reply From b7aa47d15cef6801ca8988643413dce833a91d58 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:47:28 -0300 Subject: [PATCH 301/416] atlasbot: preserve response text with confidence --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7ec373fd..b3e617d5 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-34 + checksum/atlasbot-configmap: manual-atlasbot-35 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6644afb7..c790f5c5 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -149,7 +149,7 @@ def _ensure_confidence(text: str) -> str: match = CONFIDENCE_RE.search(line) if match: level = match.group(1).lower() - lines[idx] = f"Confidence: {level}" + lines[idx] = CONFIDENCE_RE.sub(f"Confidence: {level}", line) return "\n".join(lines) lines.append("Confidence: medium") return "\n".join(lines) From 2edbef8774026530555e9779f7a4f4bc819632d8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:53:17 -0300 Subject: [PATCH 302/416] atlasbot: enrich snapshot facts and pod metrics --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 56 ++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index b3e617d5..fd2f3992 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-35 + checksum/atlasbot-configmap: manual-atlasbot-36 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index c790f5c5..03306204 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -95,6 +95,8 @@ METRIC_HINT_WORDS = { "pending", "unreachable", "latency", + "pod", + "pods", } CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) @@ -116,6 +118,7 @@ METRIC_HINTS = { "net": ("net", "network", "bandwidth", "throughput"), "io": ("io", "disk", "storage"), "connections": ("connections", "conn", "postgres", "database", "db"), + "pods": ("pods", "pod"), } _OLLAMA_LOCK = threading.Lock() @@ -488,13 +491,15 @@ def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool: return "* 100" in expr or "*100" in expr -def _format_metric_value(value: str, *, percent: bool) -> str: +def _format_metric_value(value: str, *, percent: bool, rate: bool = False) -> str: try: num = float(value) except (TypeError, ValueError): return value if percent: return f"{num:.1f}%" + if rate: + return _humanize_rate(value, unit="rate") if abs(num) >= 1: return f"{num:.2f}".rstrip("0").rstrip(".") return f"{num:.4f}".rstrip("0").rstrip(".") @@ -779,6 +784,11 @@ def facts_context( lines: list[str] = ["Facts (live snapshot):"] if total is not None: lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}") + if isinstance(summary, dict): + by_arch_counts = summary.get("by_arch") + if isinstance(by_arch_counts, dict) and by_arch_counts: + parts = [f"{arch}={count}" for arch, count in sorted(by_arch_counts.items())] + lines.append(f"- nodes_by_arch: {', '.join(parts)}") if not_ready_names: lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}") for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): @@ -799,7 +809,7 @@ def facts_context( lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}") if not_ready_workers: lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") - if expected_workers: + if expected_workers and any(word in normalize_query(prompt) for word in ("missing", "expected", "should", "not ready", "unready")): missing = sorted( set(expected_workers) - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")} @@ -814,7 +824,11 @@ def facts_context( node = entry.get("node") value = entry.get("value") if node and value is not None: - value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram")) + value_fmt = _format_metric_value( + str(value), + percent=key in ("cpu", "ram"), + rate=key in ("net", "io"), + ) lines.append(f"- hottest_{key}: {node} ({value_fmt})") postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} @@ -829,6 +843,11 @@ def facts_context( f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})" ) + for key in ("pods_running", "pods_pending", "pods_failed", "pods_succeeded"): + value = metrics.get(key) + if value is not None: + lines.append(f"- {key}: {value}") + usage_table = _node_usage_table(metrics) if usage_table: lines.append("- node_usage (cpu/ram/net/io):") @@ -838,8 +857,16 @@ def facts_context( continue cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else "" ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else "" - net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else "" - io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else "" + net = ( + _format_metric_value(str(entry.get("net")), percent=False, rate=True) + if entry.get("net") is not None + else "" + ) + io_val = ( + _format_metric_value(str(entry.get("io")), percent=False, rate=True) + if entry.get("io") is not None + else "" + ) lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") if nodes_in_query: @@ -1029,7 +1056,7 @@ def snapshot_metric_answer( if top: node, val = top percent = metric in {"cpu", "ram"} - value = _format_metric_value(str(val), percent=percent) + value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"}) scope = "" if include_hw: scope = f" among {' and '.join(sorted(include_hw))}" @@ -1051,6 +1078,23 @@ def snapshot_metric_answer( if parts: return _format_confidence(" ".join(parts), "high") + if metric == "pods": + running = metrics.get("pods_running") + pending = metrics.get("pods_pending") + failed = metrics.get("pods_failed") + succeeded = metrics.get("pods_succeeded") + parts = [] + if running is not None: + parts.append(f"running {running:.0f}") + if pending is not None: + parts.append(f"pending {pending:.0f}") + if failed is not None: + parts.append(f"failed {failed:.0f}") + if succeeded is not None: + parts.append(f"succeeded {succeeded:.0f}") + if parts: + return _format_confidence(f"Pods: {', '.join(parts)}.", "high") + return "" def structured_answer( From fb6d3b515ce3c80d063fc34bd5ab412ec6ea0e80 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 12:59:11 -0300 Subject: [PATCH 303/416] atlasbot: use structured answers before LLM --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index fd2f3992..7fdbf649 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-36 + checksum/atlasbot-configmap: manual-atlasbot-37 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 03306204..ff528ea0 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1744,6 +1744,17 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) + metrics_summary = snapshot_context(prompt, snapshot) + structured = structured_answer( + prompt, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + self._write_json(200, {"answer": structured}) + return context = build_context( prompt, allow_tools=False, @@ -2065,6 +2076,19 @@ def sync_loop(token: str, room_id: str): if not inventory: inventory = _snapshot_inventory(snapshot) workloads = _snapshot_workloads(snapshot) + metrics_summary = snapshot_context(body, snapshot) + structured = structured_answer( + body, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + history[hist_key].append(f"Atlas: {structured}") + history[hist_key] = history[hist_key][-80:] + send_msg(token, rid, structured) + continue context = build_context( body, allow_tools=allow_tools, From 373eb64c0d59f7a76410e70f9e0dbf2fa50ebeec Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:02:23 -0300 Subject: [PATCH 304/416] atlasbot: refine role and hardware filters --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7fdbf649..ce53f8cb 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-37 + checksum/atlasbot-configmap: manual-atlasbot-38 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index ff528ea0..a7741cda 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -432,7 +432,10 @@ def _detect_metric(q: str) -> str | None: def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: include: set[str] = set() exclude: set[str] = set() + rpi_specific = "rpi4" in q or "rpi5" in q for hardware, phrases in HARDWARE_HINTS.items(): + if hardware == "rpi" and rpi_specific: + continue for phrase in phrases: if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q: exclude.add(hardware) @@ -440,6 +443,17 @@ def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: include.add(hardware) return include, exclude + +def _detect_role_filters(q: str) -> set[str]: + roles: set[str] = set() + if "control-plane" in q or "control plane" in q: + roles.add("control-plane") + if "master" in q: + roles.add("master") + if "accelerator" in q: + roles.add("accelerator") + return roles + def _detect_entity(q: str) -> str | None: if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q): return "node" @@ -1125,6 +1139,7 @@ def structured_answer( include_hw, exclude_hw = _detect_hardware_filters(q) nodes_in_query = _extract_titan_nodes(q) only_workers = "worker" in q or "workers" in q + role_filters = _detect_role_filters(q) only_ready: bool | None = None if "not ready" in q or "unready" in q or "down" in q or "missing" in q: only_ready = False @@ -1201,6 +1216,12 @@ def structured_answer( only_ready=only_ready if op in ("status", "count") else None, nodes_in_query=nodes_in_query, ) + if role_filters: + filtered = [ + node + for node in filtered + if role_filters.intersection(set(node.get("roles") or [])) + ] names = [node["name"] for node in filtered] if op == "status": From 2c26ec4a6fd520b2977243622e00ab2ccf75d1a9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:13:20 -0300 Subject: [PATCH 305/416] atlasbot: fix metric detection and role counts --- services/comms/scripts/atlasbot/bot.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index a7741cda..739019c2 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -424,9 +424,14 @@ def _detect_operation(q: str) -> str | None: return None def _detect_metric(q: str) -> str | None: + tokens = set(_tokens(q)) for metric, phrases in METRIC_HINTS.items(): - if _has_any(q, phrases): - return metric + for phrase in phrases: + if " " in phrase: + if phrase in q: + return metric + elif phrase in tokens: + return metric return None def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: @@ -1249,7 +1254,7 @@ def structured_answer( if missing: msg += f" Missing: {', '.join(missing)}." return _format_confidence(msg, "high") - if not (include_hw or exclude_hw or nodes_in_query or only_workers): + if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters): return _format_confidence(f"Atlas has {len(names)} nodes.", "high") return _format_confidence(f"Matching nodes: {len(names)}.", "high") From a8f12ac94348817fb2f8476e0607d38cf8494917 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:15:13 -0300 Subject: [PATCH 306/416] comms: roll atlasbot after script update --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index ce53f8cb..4e793476 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-38 + checksum/atlasbot-configmap: manual-atlasbot-39 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From f8a4febea9a816a5f75324480ab928fd8512c197 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:17:33 -0300 Subject: [PATCH 307/416] atlasbot: refine ready/pod counts --- services/comms/scripts/atlasbot/bot.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 739019c2..f7cfd824 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1102,6 +1102,15 @@ def snapshot_metric_answer( pending = metrics.get("pods_pending") failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") + if "pending" in q and pending is not None: + return _format_confidence(f"Pending pods: {pending:.0f}.", "high") + if "failed" in q and failed is not None: + return _format_confidence(f"Failed pods: {failed:.0f}.", "high") + if "succeeded" in q or "completed" in q: + if succeeded is not None: + return _format_confidence(f"Succeeded pods: {succeeded:.0f}.", "high") + if "running" in q and running is not None: + return _format_confidence(f"Running pods: {running:.0f}.", "high") parts = [] if running is not None: parts.append(f"running {running:.0f}") @@ -1254,6 +1263,10 @@ def structured_answer( if missing: msg += f" Missing: {', '.join(missing)}." return _format_confidence(msg, "high") + if only_ready is True: + return _format_confidence(f"Ready nodes: {len(names)}.", "high") + if only_ready is False: + return _format_confidence(f"Not ready nodes: {len(names)}.", "high") if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters): return _format_confidence(f"Atlas has {len(names)} nodes.", "high") return _format_confidence(f"Matching nodes: {len(names)}.", "high") From 6351bfcdedd9183e6cb60f2615f2c83dd7cc6ee4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 13:18:01 -0300 Subject: [PATCH 308/416] comms: roll atlasbot after answer tweaks --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4e793476..9af766dc 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-39 + checksum/atlasbot-configmap: manual-atlasbot-40 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 369c0d27c51b9d315c9b16a07d52d060c60bdad7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 14:09:23 -0300 Subject: [PATCH 309/416] portal: allow longer atlasbot responses --- services/bstein-dev-home/backend-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 26c99e11..ba7d6f80 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -70,7 +70,7 @@ spec: - name: AI_ATLASBOT_ENDPOINT value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer - name: AI_ATLASBOT_TIMEOUT_SEC - value: "5" + value: "30" - name: AI_NODE_NAME valueFrom: fieldRef: From 9e4a5b7e6b6b11f1303d4cfee8a1ad00480693f8 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 17:12:03 +0000 Subject: [PATCH 310/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a520991b..563b920e 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From f749e5f1f8ae2430071314eae2d78791e2d67cdd Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 17:12:07 +0000 Subject: [PATCH 311/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 563b920e..66d41e30 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From 41890e06ab702bb70f94141704d0ec21dea2fcad Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 14:38:05 -0300 Subject: [PATCH 312/416] atlasbot: ignore mentions and gate cluster context --- services/comms/scripts/atlasbot/bot.py | 193 +++++++++++++++++++------ 1 file changed, 146 insertions(+), 47 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f7cfd824..26fe7efc 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -121,6 +121,49 @@ METRIC_HINTS = { "pods": ("pods", "pod"), } +CLUSTER_HINT_WORDS = { + "atlas", + "titan", + "cluster", + "k8s", + "kubernetes", + "node", + "nodes", + "pod", + "pods", + "namespace", + "service", + "deployment", + "daemonset", + "statefulset", + "grafana", + "victoria", + "prometheus", + "ariadne", + "mailu", + "nextcloud", + "vaultwarden", + "firefly", + "wger", + "jellyfin", + "planka", + "budget", + "element", + "synapse", + "mas", + "comms", + "longhorn", + "harbor", + "jenkins", + "gitea", + "flux", + "keycloak", + "postgres", + "database", + "db", + "atlasbot", +} + _OLLAMA_LOCK = threading.Lock() HARDWARE_HINTS = { @@ -231,6 +274,18 @@ def is_mentioned(content: dict, body: str) -> bool: return False return any(isinstance(uid, str) and uid.lower() in MENTION_USER_IDS for uid in user_ids) +def _strip_bot_mention(text: str) -> str: + if not text: + return "" + if not MENTION_LOCALPARTS: + return text.strip() + names = [re.escape(name) for name in MENTION_LOCALPARTS if name] + if not names: + return text.strip() + pattern = r"^(?:\s*@?(?:" + "|".join(names) + r")(?::)?\s+)+" + cleaned = re.sub(pattern, "", text, flags=re.IGNORECASE).strip() + return cleaned or text.strip() + # Matrix HTTP helper. def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): @@ -1780,33 +1835,38 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): if not prompt: self._write_json(400, {"error": "missing_prompt"}) return + cleaned = _strip_bot_mention(prompt) snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) - metrics_summary = snapshot_context(prompt, snapshot) - structured = structured_answer( - prompt, - inventory=inventory, - metrics_summary=metrics_summary, - snapshot=snapshot, - workloads=workloads, - ) - if structured: - self._write_json(200, {"answer": structured}) - return - context = build_context( - prompt, - allow_tools=False, - targets=[], - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - ) - metrics_context, _metrics_fallback = metrics_query_context(prompt, allow_tools=True) - if metrics_context: - context = (context + "\n\n" + metrics_context).strip() if context else metrics_context + cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + metrics_summary = snapshot_context(cleaned, snapshot) if cluster_query else "" + if cluster_query: + structured = structured_answer( + cleaned, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + self._write_json(200, {"answer": structured}) + return + context = "" + if cluster_query: + context = build_context( + cleaned, + allow_tools=False, + targets=[], + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) + metrics_context, _metrics_fallback = metrics_query_context(cleaned, allow_tools=True) + if metrics_context: + context = (context + "\n\n" + metrics_context).strip() if context else metrics_context fallback = "I don't have enough data to answer that." - answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback) + answer = ollama_reply(("http", "internal"), cleaned, context=context, fallback=fallback) self._write_json(200, {"answer": answer}) @@ -1920,6 +1980,37 @@ def _knowledge_intent(prompt: str) -> bool: ) +def _is_cluster_query( + prompt: str, + *, + inventory: list[dict[str, Any]] | None, + workloads: list[dict[str, Any]] | None, +) -> bool: + q = normalize_query(prompt) + if not q: + return False + if TITAN_NODE_RE.search(q): + return True + if any(word in q for word in CLUSTER_HINT_WORDS): + return True + for host_match in HOST_RE.finditer(q): + host = host_match.group(1).lower() + if host.endswith("bstein.dev"): + return True + tokens = set(_tokens(q)) + if workloads: + for entry in workloads: + if not isinstance(entry, dict): + continue + if tokens & _workload_tokens(entry): + return True + if inventory: + names = {node.get("name") for node in inventory if isinstance(node, dict)} + if tokens & {n for n in names if n}: + return True + return False + + def _inventory_summary(inventory: list[dict[str, Any]]) -> str: if not inventory: return "" @@ -1958,7 +2049,8 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " "Translate metrics into natural language instead of echoing raw label/value pairs. " - "Do not answer by only listing runbooks; summarize the cluster first and mention docs only if useful. " + "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. " + "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. " "If the answer is not grounded in the provided context or tool data, say you do not know. " "End every response with a line: 'Confidence: high|medium|low'." ) @@ -2087,7 +2179,8 @@ def sync_loop(token: str, room_id: str): if not (is_dm or mentioned): continue - lower_body = body.lower() + cleaned_body = _strip_bot_mention(body) + lower_body = cleaned_body.lower() # Only do live cluster introspection in DMs; metrics can be answered when mentioned. allow_tools = is_dm @@ -2101,7 +2194,7 @@ def sync_loop(token: str, room_id: str): # Attempt to scope tools to the most likely workloads when hostnames are mentioned. targets: list[tuple[str, str]] = [] - for m in HOST_RE.finditer(body.lower()): + for m in HOST_RE.finditer(lower_body): host = m.group(1).lower() for ep in _HOST_INDEX.get(host, []): backend = ep.get("backend") or {} @@ -2111,39 +2204,45 @@ def sync_loop(token: str, room_id: str): targets.append((ns, str(w["name"]))) snapshot = _snapshot_state() - inventory = node_inventory_for_prompt(body) + inventory = node_inventory_for_prompt(cleaned_body) if not inventory: inventory = _snapshot_inventory(snapshot) workloads = _snapshot_workloads(snapshot) - metrics_summary = snapshot_context(body, snapshot) - structured = structured_answer( - body, - inventory=inventory, - metrics_summary=metrics_summary, - snapshot=snapshot, - workloads=workloads, - ) + cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + metrics_summary = snapshot_context(cleaned_body, snapshot) if cluster_query else "" + structured = "" + if cluster_query: + structured = structured_answer( + cleaned_body, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) if structured: history[hist_key].append(f"Atlas: {structured}") history[hist_key] = history[hist_key][-80:] send_msg(token, rid, structured) continue - context = build_context( - body, - allow_tools=allow_tools, - targets=targets, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - ) + context = "" + if cluster_query: + context = build_context( + cleaned_body, + allow_tools=allow_tools, + targets=targets, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) if allow_tools and promql: res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - metrics_context, _metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics) - if metrics_context: - context = (context + "\n\n" + metrics_context).strip() if context else metrics_context + if cluster_query: + metrics_context, _metrics_fallback = metrics_query_context(cleaned_body, allow_tools=allow_metrics) + if metrics_context: + context = (context + "\n\n" + metrics_context).strip() if context else metrics_context fallback = "I don't have enough data to answer that." @@ -2151,7 +2250,7 @@ def sync_loop(token: str, room_id: str): token, rid, hist_key, - body, + cleaned_body, context=context, fallback=fallback, ) From 21ff16cd7b08ec810c83f1615bb810e270bdf317 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 14:38:15 -0300 Subject: [PATCH 313/416] comms: roll atlasbot for mention stripping --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 9af766dc..aa91fdf6 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-40 + checksum/atlasbot-configmap: manual-atlasbot-41 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 681a37a9aec86ea2169c657462c35402d6a1519c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 14:54:09 -0300 Subject: [PATCH 314/416] atlasbot: simplify cluster gating and context --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 197 ++++++++++++++++-------- 2 files changed, 133 insertions(+), 66 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index aa91fdf6..a2b0a3c2 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-41 + checksum/atlasbot-configmap: manual-atlasbot-42 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 26fe7efc..64097dab 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -65,6 +65,16 @@ STOPWORDS = { "help", "atlas", "othrys", + "system", + "systems", + "service", + "services", + "app", + "apps", + "platform", + "software", + "tool", + "tools", } METRIC_HINT_WORDS = { @@ -129,6 +139,8 @@ CLUSTER_HINT_WORDS = { "kubernetes", "node", "nodes", + "worker", + "workers", "pod", "pods", "namespace", @@ -162,6 +174,11 @@ CLUSTER_HINT_WORDS = { "database", "db", "atlasbot", + "jetson", + "rpi", + "raspberry", + "amd64", + "arm64", } _OLLAMA_LOCK = threading.Lock() @@ -1840,18 +1857,6 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) - metrics_summary = snapshot_context(cleaned, snapshot) if cluster_query else "" - if cluster_query: - structured = structured_answer( - cleaned, - inventory=inventory, - metrics_summary=metrics_summary, - snapshot=snapshot, - workloads=workloads, - ) - if structured: - self._write_json(200, {"answer": structured}) - return context = "" if cluster_query: context = build_context( @@ -1862,11 +1867,14 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot=snapshot, workloads=workloads, ) - metrics_context, _metrics_fallback = metrics_query_context(cleaned, allow_tools=True) - if metrics_context: - context = (context + "\n\n" + metrics_context).strip() if context else metrics_context fallback = "I don't have enough data to answer that." - answer = ollama_reply(("http", "internal"), cleaned, context=context, fallback=fallback) + answer = ollama_reply( + ("http", "internal"), + cleaned, + context=context, + fallback=fallback, + use_history=False, + ) self._write_json(200, {"answer": answer}) @@ -1897,6 +1905,15 @@ def build_context( if facts: parts.append(facts) + snapshot_json = snapshot_compact_context( + prompt, + snapshot, + inventory=inventory, + workloads=workloads, + ) + if snapshot_json: + parts.append(snapshot_json) + endpoints, edges = catalog_hints(prompt) if endpoints: parts.append(endpoints) @@ -1925,15 +1942,6 @@ def build_context( if flux_bad: parts.append("Flux (not ready):\n" + flux_bad) - p_l = (prompt or "").lower() - if any(w in p_l for w in METRIC_HINT_WORDS): - restarts = vm_top_restarts(1) - if restarts: - parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts) - snap = vm_cluster_snapshot() - if snap: - parts.append("VictoriaMetrics (cluster snapshot):\n" + snap) - return "\n\n".join([p for p in parts if p]).strip() @@ -1963,6 +1971,68 @@ def snapshot_context(prompt: str, snapshot: dict[str, Any] | None) -> str: parts.append(f"Snapshot: workload={match}.") return "\n".join(parts).strip() +def _compact_nodes_detail(snapshot: dict[str, Any]) -> list[dict[str, Any]]: + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + output: list[dict[str, Any]] = [] + for node in details: + if not isinstance(node, dict): + continue + name = node.get("name") + if not name: + continue + output.append( + { + "name": name, + "ready": node.get("ready"), + "hardware": node.get("hardware"), + "arch": node.get("arch"), + "roles": node.get("roles"), + "is_worker": node.get("is_worker"), + "os": node.get("os"), + "kernel": node.get("kernel"), + "kubelet": node.get("kubelet"), + "container_runtime": node.get("container_runtime"), + } + ) + return output + +def _compact_metrics(snapshot: dict[str, Any]) -> dict[str, Any]: + metrics = snapshot.get("metrics") if isinstance(snapshot.get("metrics"), dict) else {} + return { + "pods_running": metrics.get("pods_running"), + "pods_pending": metrics.get("pods_pending"), + "pods_failed": metrics.get("pods_failed"), + "pods_succeeded": metrics.get("pods_succeeded"), + "postgres_connections": metrics.get("postgres_connections"), + "hottest_nodes": metrics.get("hottest_nodes"), + "node_usage": metrics.get("node_usage"), + "top_restarts_1h": metrics.get("top_restarts_1h"), + } + +def snapshot_compact_context( + prompt: str, + snapshot: dict[str, Any] | None, + *, + inventory: list[dict[str, Any]] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + if not snapshot: + return "" + compact = { + "collected_at": snapshot.get("collected_at"), + "nodes_summary": snapshot.get("nodes_summary"), + "expected_workers": expected_worker_nodes_from_metrics(), + "nodes_detail": _compact_nodes_detail(snapshot), + "workloads": _workloads_for_prompt(prompt, workloads or [], limit=40) if workloads else [], + "metrics": _compact_metrics(snapshot), + "flux": snapshot.get("flux"), + "errors": snapshot.get("errors"), + } + text = json.dumps(compact, ensure_ascii=False) + if len(text) > MAX_FACTS_CHARS: + text = text[: MAX_FACTS_CHARS - 3].rstrip() + "..." + return "Cluster snapshot (JSON):\n" + text + def _knowledge_intent(prompt: str) -> bool: q = normalize_query(prompt) @@ -1998,16 +2068,8 @@ def _is_cluster_query( if host.endswith("bstein.dev"): return True tokens = set(_tokens(q)) - if workloads: - for entry in workloads: - if not isinstance(entry, dict): - continue - if tokens & _workload_tokens(entry): - return True - if inventory: - names = {node.get("name") for node in inventory if isinstance(node, dict)} - if tokens & {n for n in names if n}: - return True + if _NAME_INDEX and tokens & _NAME_INDEX: + return True return False @@ -2037,7 +2099,7 @@ def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: summary = "\n".join(parts).strip() return _format_confidence(summary, "medium") if summary else "" -def _ollama_call(hist_key, prompt: str, *, context: str) -> str: +def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = True) -> str: system = ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " @@ -2062,7 +2124,8 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: system_content += "\n\nContext (grounded):\n" + context[:MAX_CONTEXT_CHARS] messages: list[dict[str, str]] = [{"role": "system", "content": system_content}] - messages.extend(_history_to_messages(history[hist_key][-24:])) + if use_history: + messages.extend(_history_to_messages(history[hist_key][-24:])) messages.append({"role": "user", "content": prompt}) payload = {"model": MODEL, "messages": messages, "stream": False} @@ -2082,31 +2145,55 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str: else: raw_reply = data.get("response") or data.get("reply") or data reply = _normalize_reply(raw_reply) or "I'm here to help." - history[hist_key].append(f"Atlas: {reply}") + if use_history: + history[hist_key].append(f"Atlas: {reply}") return reply finally: if lock: lock.release() -def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str: +def ollama_reply( + hist_key, + prompt: str, + *, + context: str, + fallback: str = "", + use_history: bool = True, +) -> str: last_error = None for attempt in range(max(1, OLLAMA_RETRIES + 1)): try: - return _ollama_call(hist_key, prompt, context=context) + return _ollama_call(hist_key, prompt, context=context, use_history=use_history) except Exception as exc: # noqa: BLE001 last_error = exc time.sleep(min(4, 2 ** attempt)) if fallback: - history[hist_key].append(f"Atlas: {fallback}") + if use_history: + history[hist_key].append(f"Atlas: {fallback}") return fallback return "I don't have enough data to answer that." -def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str: +def ollama_reply_with_thinking( + token: str, + room: str, + hist_key, + prompt: str, + *, + context: str, + fallback: str, + use_history: bool = True, +) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() def worker(): - result["reply"] = ollama_reply(hist_key, prompt, context=context, fallback=fallback) + result["reply"] = ollama_reply( + hist_key, + prompt, + context=context, + fallback=fallback, + use_history=use_history, + ) done.set() thread = threading.Thread(target=worker, daemon=True) @@ -2182,9 +2269,8 @@ def sync_loop(token: str, room_id: str): cleaned_body = _strip_bot_mention(body) lower_body = cleaned_body.lower() - # Only do live cluster introspection in DMs; metrics can be answered when mentioned. + # Only do live cluster introspection in DMs. allow_tools = is_dm - allow_metrics = is_dm or mentioned promql = "" if allow_tools: @@ -2209,21 +2295,6 @@ def sync_loop(token: str, room_id: str): inventory = _snapshot_inventory(snapshot) workloads = _snapshot_workloads(snapshot) cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) - metrics_summary = snapshot_context(cleaned_body, snapshot) if cluster_query else "" - structured = "" - if cluster_query: - structured = structured_answer( - cleaned_body, - inventory=inventory, - metrics_summary=metrics_summary, - snapshot=snapshot, - workloads=workloads, - ) - if structured: - history[hist_key].append(f"Atlas: {structured}") - history[hist_key] = history[hist_key][-80:] - send_msg(token, rid, structured) - continue context = "" if cluster_query: context = build_context( @@ -2239,11 +2310,6 @@ def sync_loop(token: str, room_id: str): rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered context = (context + "\n\n" + extra).strip() if context else extra - if cluster_query: - metrics_context, _metrics_fallback = metrics_query_context(cleaned_body, allow_tools=allow_metrics) - if metrics_context: - context = (context + "\n\n" + metrics_context).strip() if context else metrics_context - fallback = "I don't have enough data to answer that." reply = ollama_reply_with_thinking( @@ -2253,6 +2319,7 @@ def sync_loop(token: str, room_id: str): cleaned_body, context=context, fallback=fallback, + use_history=cluster_query, ) send_msg(token, rid, reply) From 60195033f6741984c332a43a0378adb7ceafcc5b Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 17:58:07 +0000 Subject: [PATCH 315/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 66d41e30..04d7e825 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From c758f28be0faeaab1368ff6b247a9c587ef73590 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 17:58:11 +0000 Subject: [PATCH 316/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 04d7e825..bb9e5f09 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 62b7ea7dcbf207a5dc20bf7649ac662fcb7987cc Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:00:36 -0300 Subject: [PATCH 317/416] atlasbot: tighten cluster intent and snapshot framing --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index a2b0a3c2..d24cba2b 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-42 + checksum/atlasbot-configmap: manual-atlasbot-43 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 64097dab..bee72e91 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2104,6 +2104,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " "Use the provided context and facts as your source of truth. " + "If the context includes a cluster snapshot, treat the question as about the Atlas/Othrys cluster even if the prompt is ambiguous. " "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " "Prefer exact repo paths and Kubernetes resource names when relevant. " From ef578456d0c518ce81e2b46315e2fa030c58824a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:04:10 -0300 Subject: [PATCH 318/416] atlasbot: force cluster intent in prompts --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d24cba2b..f4e7f7d1 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-43 + checksum/atlasbot-configmap: manual-atlasbot-44 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index bee72e91..4316fe03 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1868,9 +1868,12 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): workloads=workloads, ) fallback = "I don't have enough data to answer that." + llm_prompt = cleaned + if cluster_query: + llm_prompt = f"Atlas cluster question: {cleaned}" answer = ollama_reply( ("http", "internal"), - cleaned, + llm_prompt, context=context, fallback=fallback, use_history=False, @@ -2313,11 +2316,14 @@ def sync_loop(token: str, room_id: str): context = (context + "\n\n" + extra).strip() if context else extra fallback = "I don't have enough data to answer that." + llm_prompt = cleaned_body + if cluster_query: + llm_prompt = f"Atlas cluster question: {cleaned_body}" reply = ollama_reply_with_thinking( token, rid, hist_key, - cleaned_body, + llm_prompt, context=context, fallback=fallback, use_history=cluster_query, From 0fca01d9a1b14f2c78e889697ff53166a74169ea Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:07:28 -0300 Subject: [PATCH 319/416] atlasbot: strengthen cluster disambiguation --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index f4e7f7d1..de50c37d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-44 + checksum/atlasbot-configmap: manual-atlasbot-45 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 4316fe03..62304fa0 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1870,7 +1870,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): fallback = "I don't have enough data to answer that." llm_prompt = cleaned if cluster_query: - llm_prompt = f"Atlas cluster question: {cleaned}" + llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned}" answer = ollama_reply( ("http", "internal"), llm_prompt, @@ -2108,6 +2108,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru "Be helpful, direct, and concise. " "Use the provided context and facts as your source of truth. " "If the context includes a cluster snapshot, treat the question as about the Atlas/Othrys cluster even if the prompt is ambiguous. " + "When a cluster snapshot is provided, never answer about unrelated meanings of 'Atlas' (maps, mythology, Apache Atlas, etc). " "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " "Prefer exact repo paths and Kubernetes resource names when relevant. " @@ -2318,7 +2319,7 @@ def sync_loop(token: str, room_id: str): llm_prompt = cleaned_body if cluster_query: - llm_prompt = f"Atlas cluster question: {cleaned_body}" + llm_prompt = f\"Atlas cluster question (use the cluster snapshot context): {cleaned_body}\" reply = ollama_reply_with_thinking( token, rid, From 41d185fad3a69cf101ee9e53ac5fcc236b526382 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:10:03 -0300 Subject: [PATCH 320/416] atlasbot: fix prompt formatting --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index de50c37d..d4d66684 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-45 + checksum/atlasbot-configmap: manual-atlasbot-46 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 62304fa0..429fa31d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2319,7 +2319,7 @@ def sync_loop(token: str, room_id: str): llm_prompt = cleaned_body if cluster_query: - llm_prompt = f\"Atlas cluster question (use the cluster snapshot context): {cleaned_body}\" + llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned_body}" reply = ollama_reply_with_thinking( token, rid, From 0e26d249c68b8882db74ada48d606d60798070d0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:12:47 -0300 Subject: [PATCH 321/416] atlasbot: send snapshot as explicit context --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d4d66684..47d09920 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-46 + checksum/atlasbot-configmap: manual-atlasbot-47 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 429fa31d..351bb400 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2124,11 +2124,9 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru endpoint = _ollama_endpoint() if not endpoint: raise RuntimeError("ollama endpoint missing") - system_content = system + messages: list[dict[str, str]] = [{"role": "system", "content": system}] if context: - system_content += "\n\nContext (grounded):\n" + context[:MAX_CONTEXT_CHARS] - - messages: list[dict[str, str]] = [{"role": "system", "content": system_content}] + messages.append({"role": "user", "content": "Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]}) if use_history: messages.extend(_history_to_messages(history[hist_key][-24:])) messages.append({"role": "user", "content": prompt}) From a8005bd13e75a16ca1595d47e14e8b58c88ddfe7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:30:43 -0300 Subject: [PATCH 322/416] atlasbot: answer cluster queries without llm --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 284 ++++++++++++++++++++++-- 2 files changed, 263 insertions(+), 23 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 47d09920..69b30e4d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-47 + checksum/atlasbot-configmap: manual-atlasbot-48 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 351bb400..f0bf008b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -532,7 +532,7 @@ def _detect_role_filters(q: str) -> set[str]: return roles def _detect_entity(q: str) -> str | None: - if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q): + if "node" in q or "nodes" in q or "worker" in q or "hardware" in q or "architecture" in q or TITAN_NODE_RE.search(q): return "node" if "pod" in q or "pods" in q: return "pod" @@ -1152,6 +1152,15 @@ def snapshot_metric_answer( if include_hw: scope = f" among {' and '.join(sorted(include_hw))}" answer = f"Hottest node{scope}: {node} ({value})." + if allowed_nodes and len(allowed_nodes) != len(inventory): + overall = _node_usage_top(usage, allowed_nodes=None) + if overall and overall[0] != node: + overall_val = _format_metric_value( + str(overall[1]), + percent=percent, + rate=metric in {"net", "io"}, + ) + answer += f" Overall hottest: {overall[0]} ({overall_val})." return _format_confidence(answer, "high") if metric == "connections" or "postgres" in q: @@ -1358,6 +1367,219 @@ def structured_answer( return "" + +def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None) -> str: + summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} + nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {} + total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total") + ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready") + not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready") + if total is None: + total = len(inventory) + ready = len([n for n in inventory if n.get("ready") is True]) + not_ready = len([n for n in inventory if n.get("ready") is False]) + if total is None: + return "" + return f"Atlas cluster has {total} nodes ({ready} ready, {not_ready} not ready)." + + +def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str: + if not inventory: + return "" + groups = _group_nodes(inventory) + parts: list[str] = [] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes = groups.get(key) or [] + if nodes: + parts.append(f"{key}={len(nodes)}") + if not parts: + return "" + return "Hardware mix: " + ", ".join(parts) + "." + + +def _os_mix_line(snapshot: dict[str, Any] | None) -> str: + if not snapshot: + return "" + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + counts: dict[str, int] = collections.Counter() + for node in details: + if not isinstance(node, dict): + continue + os_name = (node.get("os") or "").strip() + if os_name: + counts[os_name] += 1 + if not counts: + return "" + parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))] + return "OS mix: " + ", ".join(parts[:5]) + "." + + +def _pods_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + running = metrics.get("pods_running") + pending = metrics.get("pods_pending") + failed = metrics.get("pods_failed") + succeeded = metrics.get("pods_succeeded") + parts: list[str] = [] + if running is not None: + parts.append(f"{running:.0f} running") + if pending is not None: + parts.append(f"{pending:.0f} pending") + if failed is not None: + parts.append(f"{failed:.0f} failed") + if succeeded is not None: + parts.append(f"{succeeded:.0f} succeeded") + if not parts: + return "" + return "Pods: " + ", ".join(parts) + "." + + +def _postgres_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if not postgres: + return "" + used = postgres.get("used") + max_conn = postgres.get("max") + hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + parts: list[str] = [] + if used is not None and max_conn is not None: + parts.append(f"{used:.0f}/{max_conn:.0f} connections") + if hottest.get("label"): + hot_val = hottest.get("value") + hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" + parts.append(f"hottest {hottest.get('label')} ({hot_val_str})") + if not parts: + return "" + return "Postgres: " + ", ".join(parts) + "." + + +def _hottest_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if not hottest: + return "" + parts: list[str] = [] + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if node and value is not None: + value_fmt = _format_metric_value( + str(value), + percent=key in ("cpu", "ram"), + rate=key in ("net", "io"), + ) + parts.append(f"{key.upper()} {node} ({value_fmt})") + if not parts: + return "" + return "Hottest nodes: " + "; ".join(parts) + "." + + +def cluster_overview_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, +) -> str: + if not inventory and not snapshot: + return "" + q = normalize_query(prompt) + metrics = _snapshot_metrics(snapshot) + lines: list[str] = [] + + nodes_line = _nodes_summary_line(inventory, snapshot) + if nodes_line: + lines.append(nodes_line) + + if any(word in q for word in ("hardware", "architecture", "nodes", "node", "cluster", "atlas", "titan", "lab")): + hw_line = _hardware_mix_line(inventory) + if hw_line: + lines.append(hw_line) + os_line = _os_mix_line(snapshot) + if os_line: + lines.append(os_line) + + if any( + word in q + for word in ( + "interesting", + "status", + "health", + "overview", + "summary", + "tell me", + "what do you know", + "about", + "pods", + "postgres", + "connections", + "hottest", + "cpu", + "ram", + "memory", + "net", + "network", + "io", + "disk", + "busy", + "load", + "usage", + "utilization", + ) + ): + pods_line = _pods_summary_line(metrics) + if pods_line: + lines.append(pods_line) + hottest_line = _hottest_summary_line(metrics) + if hottest_line: + lines.append(hottest_line) + postgres_line = _postgres_summary_line(metrics) + if postgres_line: + lines.append(postgres_line) + + if not lines: + return "" + return "Based on the snapshot, " + "\n".join(lines) + + +def cluster_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + metrics_summary = snapshot_context(prompt, snapshot) + structured = structured_answer( + prompt, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + return structured + + overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot) + if overview: + kb_titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else "" + if kb_titles: + overview = overview + "\n" + kb_titles + return _format_confidence(overview, "medium") + + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + return _format_confidence(kb_titles, "low") + + if metrics_summary: + return _format_confidence(metrics_summary, "low") + + return "" + def _metric_tokens(entry: dict[str, Any]) -> str: parts: list[str] = [] for key in ("panel_title", "dashboard", "description"): @@ -1868,16 +2090,24 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): workloads=workloads, ) fallback = "I don't have enough data to answer that." - llm_prompt = cleaned if cluster_query: - llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned}" - answer = ollama_reply( - ("http", "internal"), - llm_prompt, - context=context, - fallback=fallback, - use_history=False, - ) + answer = cluster_answer( + cleaned, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) + if not answer: + answer = fallback + else: + llm_prompt = cleaned + answer = ollama_reply( + ("http", "internal"), + llm_prompt, + context=context, + fallback=fallback, + use_history=False, + ) self._write_json(200, {"answer": answer}) @@ -2044,6 +2274,7 @@ def _knowledge_intent(prompt: str) -> bool: for phrase in ( "what do you know", "tell me about", + "interesting", "overview", "summary", "describe", @@ -2312,21 +2543,30 @@ def sync_loop(token: str, room_id: str): res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered - context = (context + "\n\n" + extra).strip() if context else extra + send_msg(token, rid, extra) + continue fallback = "I don't have enough data to answer that." - llm_prompt = cleaned_body if cluster_query: - llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned_body}" - reply = ollama_reply_with_thinking( - token, - rid, - hist_key, - llm_prompt, - context=context, - fallback=fallback, - use_history=cluster_query, - ) + reply = cluster_answer( + cleaned_body, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) + if not reply: + reply = fallback + else: + llm_prompt = cleaned_body + reply = ollama_reply_with_thinking( + token, + rid, + hist_key, + llm_prompt, + context=context, + fallback=fallback, + use_history=False, + ) send_msg(token, rid, reply) def login_with_retry(): From 6a8731582a4aa7db30f229fc75a6b65054207de9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:36:08 -0300 Subject: [PATCH 323/416] atlasbot: return structured cluster summaries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 69b30e4d..06856266 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-48 + checksum/atlasbot-configmap: manual-atlasbot-49 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f0bf008b..e936b955 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1268,7 +1268,17 @@ def structured_answer( node_regex = "|".join([n["name"] for n in scoped]) expr = _apply_node_filter(expr, node_regex) res = vm_query(expr, timeout=20) - answer = _format_metric_answer(entry, res) + answer = "" + if op == "top" or "hottest" in (entry.get("panel_title") or "").lower(): + node, val = _primary_series_metric(res) + if node and val is not None: + percent = _metric_expr_uses_percent(entry) + value_fmt = _format_metric_value(val or "", percent=percent) + metric_label = (metric or "").upper() + label = f"{metric_label} node" if metric_label else "node" + answer = f"Hottest {label}: {node} ({value_fmt})." + if not answer: + answer = _format_metric_answer(entry, res) if answer: scope_parts: list[str] = [] if include_hw: @@ -1292,8 +1302,8 @@ def structured_answer( percent = _metric_expr_uses_percent(entry) base_val_fmt = _format_metric_value(base_val or "", percent=percent) overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})." - return f"Among {scope} nodes, {answer}{overall_note}" - return answer + return _format_confidence(f"Among {scope} nodes, {answer}{overall_note}", "high") + return _format_confidence(answer, "high") if metrics_summary: return metrics_summary @@ -1408,7 +1418,7 @@ def _os_mix_line(snapshot: dict[str, Any] | None) -> str: os_name = (node.get("os") or "").strip() if os_name: counts[os_name] += 1 - if not counts: + if not counts or (len(counts) == 1 and "linux" in counts): return "" parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))] return "OS mix: " + ", ".join(parts[:5]) + "." From 8d467bc12f0ad30c38f2d213a786fe682b6cbad0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:42:31 -0300 Subject: [PATCH 324/416] atlasbot: improve workload matching and fallbacks --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 06856266..bccf752b 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-49 + checksum/atlasbot-configmap: manual-atlasbot-50 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e936b955..34e27cf9 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1031,6 +1031,12 @@ def _workload_tokens(entry: dict[str, Any]) -> set[str]: return tokens +def _workload_query_target(prompt: str) -> str: + tokens = set(_tokens(prompt)) + matches = sorted(tokens & _NAME_INDEX) if _NAME_INDEX else [] + return matches[0] if matches else "" + + def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, Any] | None: q_tokens = set(_tokens(prompt)) if not q_tokens: @@ -1041,6 +1047,12 @@ def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, continue tokens = _workload_tokens(entry) score = len(tokens & q_tokens) + name = (entry.get("workload") or "").lower() + namespace = (entry.get("namespace") or "").lower() + if name and name in q_tokens: + score += 5 + if namespace and namespace in q_tokens: + score += 3 if score: scored.append((score, entry)) if not scored: @@ -1574,6 +1586,14 @@ def cluster_answer( if structured: return structured + q = normalize_query(prompt) + workload_target = _workload_query_target(prompt) + if workload_target and any(word in q for word in ("where", "run", "running", "host", "node")): + return _format_confidence( + f"I don't have workload placement data for {workload_target} in the current snapshot.", + "low", + ) + overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot) if overview: kb_titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else "" From d8657c551f3a17b65fcd0c1dc56f5199d8695ee7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 15:45:18 -0300 Subject: [PATCH 325/416] atlasbot: avoid namespace-only workload matches --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index bccf752b..301a4746 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-50 + checksum/atlasbot-configmap: manual-atlasbot-51 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 34e27cf9..d36844bc 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1071,11 +1071,17 @@ def workload_answer(prompt: str, workloads: list[dict[str, Any]]) -> str: q = normalize_query(prompt) if not any(word in q for word in ("where", "which", "node", "run", "running", "host", "located")): return "" + target = _workload_query_target(prompt) entry = _select_workload(prompt, workloads) if not entry: return "" workload = entry.get("workload") or "" namespace = entry.get("namespace") or "" + if target: + workload_l = str(workload).lower() + namespace_l = str(namespace).lower() + if workload_l != target and namespace_l == target and "namespace" not in q and "workload" not in q: + return "" nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {} primary = entry.get("primary_node") or "" if not workload or not nodes: From f241189fab40ac3632d981d4b50524c011ad23d3 Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 18:57:30 +0000 Subject: [PATCH 326/416] chore(maintenance): automated image update --- services/maintenance/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index e4580aae..a1ca5831 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -26,7 +26,7 @@ resources: - image-sweeper-cronjob.yaml images: - name: registry.bstein.dev/bstein/ariadne - newTag: 0.1.0-58 # {"$imagepolicy": "maintenance:ariadne:tag"} + newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance From 0fbbbf39e9e221051bc9ac1ecf654a5ee50b79e7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:19:30 -0300 Subject: [PATCH 327/416] monitoring: fix jetson gpu metrics --- scripts/dashboards_render_atlas.py | 9 ++++++- services/monitoring/dashboards/atlas-gpu.json | 2 +- .../monitoring/grafana-dashboard-gpu.yaml | 2 +- .../jetson-tegrastats-exporter.yaml | 2 +- .../scripts/jetson_tegrastats_exporter.py | 25 +++++++++++++------ 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 675fec52..6ad43218 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -221,6 +221,13 @@ def jetson_gpu_util_by_node(): return 'max by (node) (jetson_gr3d_freq_percent{node!=""})' +def jetson_gpu_util_by_hostname(): + return ( + 'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), ' + '"Hostname", "$1", "node", "(.*)")' + ) + + def jetson_gpu_requests(scope_var): return ( "sum by (namespace,node) (" @@ -2688,7 +2695,7 @@ def build_gpu_dashboard(): timeseries_panel( 3, "GPU Util by Node", - 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})', + f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})', {"h": 8, "w": 12, "x": 0, "y": 8}, unit="percent", legend="{{Hostname}}", diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 6b76a5c2..36ab9e5f 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -126,7 +126,7 @@ }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 46b25cd0..bb395dbf 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -135,7 +135,7 @@ data: }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 8584ebaa..00743943 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "1" + monitoring.bstein.dev/restart-rev: "2" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index c237ec5d..3858d969 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -4,7 +4,7 @@ import re import socketserver import subprocess import threading -from time import time +from time import sleep, time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename @@ -20,6 +20,7 @@ METRICS = { LOCK = threading.Lock() def parse_line(line: str): + line = line.strip() updates = {} m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line) if m: @@ -34,7 +35,7 @@ def parse_line(line: str): if m: updates["ram_used_mb"] = float(m.group(1)) updates["ram_total_mb"] = float(m.group(2)) - m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line) + m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) with LOCK: @@ -42,15 +43,23 @@ def parse_line(line: str): METRICS["last_scrape_ts"] = time() def run_tegrastats(): - proc = subprocess.Popen( - ["/host/usr/bin/tegrastats", "--interval", "1000"], - stdout=subprocess.PIPE, + logfile = "/tmp/tegrastats.log" + subprocess.Popen( + ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", logfile], + stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, text=True, - bufsize=1, ) - for line in proc.stdout: - parse_line(line) + while not os.path.exists(logfile): + sleep(0.1) + with open(logfile, "r", encoding="utf-8", errors="ignore") as handle: + handle.seek(0, os.SEEK_END) + while True: + line = handle.readline() + if not line: + sleep(0.2) + continue + parse_line(line) class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): From eb809524b50e822232e1f067ac327cfd1122f168 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:23:23 -0300 Subject: [PATCH 328/416] monitoring: refresh jetson stats on scrape --- .../jetson-tegrastats-exporter.yaml | 2 +- .../scripts/jetson_tegrastats_exporter.py | 37 +++++++++++-------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 00743943..a6612c66 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "2" + monitoring.bstein.dev/restart-rev: "3" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 3858d969..4cbf6ca3 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -4,10 +4,11 @@ import re import socketserver import subprocess import threading -from time import sleep, time +from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename +LOGFILE = "/tmp/tegrastats.log" METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, @@ -42,24 +43,28 @@ def parse_line(line: str): METRICS.update(updates) METRICS["last_scrape_ts"] = time() -def run_tegrastats(): - logfile = "/tmp/tegrastats.log" +def start_tegrastats(): subprocess.Popen( - ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", logfile], + ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", LOGFILE], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, text=True, ) - while not os.path.exists(logfile): - sleep(0.1) - with open(logfile, "r", encoding="utf-8", errors="ignore") as handle: - handle.seek(0, os.SEEK_END) - while True: - line = handle.readline() - if not line: - sleep(0.2) - continue - parse_line(line) + + +def refresh_from_log(): + if not os.path.exists(LOGFILE): + return + try: + with open(LOGFILE, "rb") as handle: + handle.seek(0, os.SEEK_END) + size = handle.tell() + handle.seek(max(size - 4096, 0), os.SEEK_SET) + tail = handle.read().decode("utf-8", errors="ignore").splitlines() + if tail: + parse_line(tail[-1]) + except OSError: + return class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): @@ -67,6 +72,7 @@ class Handler(http.server.BaseHTTPRequestHandler): self.send_response(404) self.end_headers() return + refresh_from_log() with LOCK: metrics = METRICS.copy() out = [] @@ -85,7 +91,6 @@ class Handler(http.server.BaseHTTPRequestHandler): return if __name__ == "__main__": - t = threading.Thread(target=run_tegrastats, daemon=True) - t.start() + start_tegrastats() with socketserver.TCPServer(("", PORT), Handler) as httpd: httpd.serve_forever() From 3b2029056162faea54d6ba9a33f2ba72204a5b28 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:27:45 -0300 Subject: [PATCH 329/416] monitoring: read jetson stats on demand --- .../jetson-tegrastats-exporter.yaml | 2 +- .../scripts/jetson_tegrastats_exporter.py | 27 +++++++++---------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index a6612c66..d80d83eb 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "3" + monitoring.bstein.dev/restart-rev: "4" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 4cbf6ca3..204e439c 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -3,13 +3,12 @@ import os import re import socketserver import subprocess -import threading from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename LOGFILE = "/tmp/tegrastats.log" -METRICS = { +BASE_METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, "cpu_temp_c": 0.0, @@ -18,9 +17,8 @@ METRICS = { "power_5v_in_mw": 0.0, "last_scrape_ts": 0.0, } -LOCK = threading.Lock() -def parse_line(line: str): +def parse_line(line: str) -> dict: line = line.strip() updates = {} m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line) @@ -39,9 +37,7 @@ def parse_line(line: str): m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) - with LOCK: - METRICS.update(updates) - METRICS["last_scrape_ts"] = time() + return updates def start_tegrastats(): subprocess.Popen( @@ -52,19 +48,18 @@ def start_tegrastats(): ) -def refresh_from_log(): +def read_latest_line() -> str: if not os.path.exists(LOGFILE): - return + return "" try: with open(LOGFILE, "rb") as handle: handle.seek(0, os.SEEK_END) size = handle.tell() handle.seek(max(size - 4096, 0), os.SEEK_SET) tail = handle.read().decode("utf-8", errors="ignore").splitlines() - if tail: - parse_line(tail[-1]) + return tail[-1] if tail else "" except OSError: - return + return "" class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): @@ -72,9 +67,11 @@ class Handler(http.server.BaseHTTPRequestHandler): self.send_response(404) self.end_headers() return - refresh_from_log() - with LOCK: - metrics = METRICS.copy() + metrics = BASE_METRICS.copy() + line = read_latest_line() + if line: + metrics.update(parse_line(line)) + metrics["last_scrape_ts"] = time() out = [] label = f'{{node="{NODE_NAME}"}}' for k, v in metrics.items(): From aacfc8f28ca6c793111eb402bafa61ab5a05d245 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:34:31 -0300 Subject: [PATCH 330/416] monitoring: read tegrastats per scrape --- .../jetson-tegrastats-exporter.yaml | 2 +- .../scripts/jetson_tegrastats_exporter.py | 32 ++++++++----------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index d80d83eb..36799388 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "4" + monitoring.bstein.dev/restart-rev: "5" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 204e439c..8314ad72 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -7,7 +7,6 @@ from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename -LOGFILE = "/tmp/tegrastats.log" BASE_METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, @@ -39,25 +38,21 @@ def parse_line(line: str) -> dict: updates["power_5v_in_mw"] = float(m.group(1)) return updates -def start_tegrastats(): - subprocess.Popen( - ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", LOGFILE], - stdout=subprocess.DEVNULL, - stderr=subprocess.STDOUT, - text=True, - ) - - def read_latest_line() -> str: - if not os.path.exists(LOGFILE): - return "" try: - with open(LOGFILE, "rb") as handle: - handle.seek(0, os.SEEK_END) - size = handle.tell() - handle.seek(max(size - 4096, 0), os.SEEK_SET) - tail = handle.read().decode("utf-8", errors="ignore").splitlines() - return tail[-1] if tail else "" + proc = subprocess.Popen( + ["/host/usr/bin/tegrastats", "--interval", "1000"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + line = proc.stdout.readline() + proc.terminate() + try: + proc.wait(timeout=1) + except subprocess.TimeoutExpired: + proc.kill() + return line except OSError: return "" @@ -88,6 +83,5 @@ class Handler(http.server.BaseHTTPRequestHandler): return if __name__ == "__main__": - start_tegrastats() with socketserver.TCPServer(("", PORT), Handler) as httpd: httpd.serve_forever() From 0a64708b3d3f54b535529da9f4d95927a5b3c419 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:38:09 -0300 Subject: [PATCH 331/416] monitoring: expose jetson scrape line length --- services/monitoring/jetson-tegrastats-exporter.yaml | 2 +- services/monitoring/scripts/jetson_tegrastats_exporter.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 36799388..6b0ce376 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "5" + monitoring.bstein.dev/restart-rev: "6" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 8314ad72..284d5ce3 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -14,6 +14,7 @@ BASE_METRICS = { "ram_used_mb": 0.0, "ram_total_mb": 0.0, "power_5v_in_mw": 0.0, + "log_line_len": 0.0, "last_scrape_ts": 0.0, } @@ -33,7 +34,7 @@ def parse_line(line: str) -> dict: if m: updates["ram_used_mb"] = float(m.group(1)) updates["ram_total_mb"] = float(m.group(2)) - m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line) + m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)(?:mW)?/(\\d+)(?:mW)?", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) return updates @@ -66,6 +67,7 @@ class Handler(http.server.BaseHTTPRequestHandler): line = read_latest_line() if line: metrics.update(parse_line(line)) + metrics["log_line_len"] = float(len(line)) metrics["last_scrape_ts"] = time() out = [] label = f'{{node="{NODE_NAME}"}}' From c0073b08ccaee0e14e54133914d4f58855202937 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 16:44:00 -0300 Subject: [PATCH 332/416] monitoring: fix tegrastats regexes --- services/monitoring/jetson-tegrastats-exporter.yaml | 2 +- .../monitoring/scripts/jetson_tegrastats_exporter.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 6b0ce376..ba25c9fd 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,7 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" - monitoring.bstein.dev/restart-rev: "6" + monitoring.bstein.dev/restart-rev: "7" spec: serviceAccountName: default hostPID: true diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index 284d5ce3..8b361111 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -21,20 +21,20 @@ BASE_METRICS = { def parse_line(line: str) -> dict: line = line.strip() updates = {} - m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line) + m = re.search(r"GR3D_FREQ\s+(\d+)%", line) if m: updates["gr3d_freq_percent"] = float(m.group(1)) - m = re.search(r"GPU@(\\d+(?:\\.\\d+)?)C", line) + m = re.search(r"GPU@(\d+(?:\.\d+)?)C", line) if m: updates["gpu_temp_c"] = float(m.group(1)) - m = re.search(r"CPU@(\\d+(?:\\.\\d+)?)C", line) + m = re.search(r"CPU@(\d+(?:\.\d+)?)C", line) if m: updates["cpu_temp_c"] = float(m.group(1)) - m = re.search(r"RAM\\s+(\\d+)/(\\d+)MB", line) + m = re.search(r"RAM\s+(\d+)/(\d+)MB", line) if m: updates["ram_used_mb"] = float(m.group(1)) updates["ram_total_mb"] = float(m.group(2)) - m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)(?:mW)?/(\\d+)(?:mW)?", line) + m = re.search(r"(?:POM_5V_IN|VDD_IN)\s+(\d+)(?:mW)?/(\d+)(?:mW)?", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) return updates From 86cd5194ead988a7ecc30a96681432c594eb54f6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 17:51:13 -0300 Subject: [PATCH 333/416] monitoring: fix gpu idle share --- scripts/dashboards_render_atlas.py | 2 +- services/monitoring/dashboards/atlas-gpu.json | 2 +- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/grafana-dashboard-gpu.yaml | 2 +- services/monitoring/grafana-dashboard-overview.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 6ad43218..34ded89e 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -266,7 +266,7 @@ def namespace_gpu_share_expr(scope_var): usage = namespace_gpu_usage_instant(scope_var) total = f"(sum({usage}) or on() vector(0))" share = f"100 * ({usage}) / clamp_min({total}, 1)" - idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)" + idle = 'label_replace(vector(100), "namespace", "idle", "", "") * on() (' + total + " == bool 0)" return f"({share}) or ({idle})" diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 36ab9e5f..f6801aa6 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 04352f93..1a507ece 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index bb395dbf..dc1025b6 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 9495647f..ed63da05 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } From 0d4e1cac700d98c22f20697aaf187821f5a31418 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:08:19 -0300 Subject: [PATCH 334/416] atlasbot: make cluster answers more narrative --- services/comms/scripts/atlasbot/bot.py | 196 +++++++++++++++++++++---- 1 file changed, 165 insertions(+), 31 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index d36844bc..0dcfc606 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -181,6 +181,27 @@ CLUSTER_HINT_WORDS = { "arm64", } +_INSIGHT_HINT_WORDS = { + "interesting", + "unconventional", + "surprising", + "weird", + "odd", + "fun", + "cool", + "unique", + "notable", +} + +_OVERVIEW_HINT_WORDS = { + "overview", + "summary", + "describe", + "explain", + "tell me about", + "what do you know", +} + _OLLAMA_LOCK = threading.Lock() HARDWARE_HINTS = { @@ -1408,7 +1429,18 @@ def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any not_ready = len([n for n in inventory if n.get("ready") is False]) if total is None: return "" - return f"Atlas cluster has {total} nodes ({ready} ready, {not_ready} not ready)." + if not_ready: + names = [] + summary_names = summary.get("not_ready_names") if isinstance(summary, dict) else [] + if isinstance(summary_names, list): + names = [name for name in summary_names if isinstance(name, str)] + if not names and snapshot: + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + names = [node.get("name") for node in details if isinstance(node, dict) and node.get("ready") is False] + names = [name for name in names if isinstance(name, str) and name] + suffix = f" (not ready: {', '.join(names)})" if names else "" + return f"Atlas has {total} nodes; {ready} ready, {not_ready} not ready{suffix}." + return f"Atlas has {total} nodes and all are Ready." def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str: @@ -1422,7 +1454,7 @@ def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str: parts.append(f"{key}={len(nodes)}") if not parts: return "" - return "Hardware mix: " + ", ".join(parts) + "." + return "Hardware mix includes " + ", ".join(parts) + "." def _os_mix_line(snapshot: dict[str, Any] | None) -> str: @@ -1449,6 +1481,8 @@ def _pods_summary_line(metrics: dict[str, Any]) -> str: pending = metrics.get("pods_pending") failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") + if running is None and pending is None and failed is None and succeeded is None: + return "" parts: list[str] = [] if running is not None: parts.append(f"{running:.0f} running") @@ -1458,9 +1492,7 @@ def _pods_summary_line(metrics: dict[str, Any]) -> str: parts.append(f"{failed:.0f} failed") if succeeded is not None: parts.append(f"{succeeded:.0f} succeeded") - if not parts: - return "" - return "Pods: " + ", ".join(parts) + "." + return "There are " + ", ".join(parts) + " pods." def _postgres_summary_line(metrics: dict[str, Any]) -> str: @@ -1481,7 +1513,7 @@ def _postgres_summary_line(metrics: dict[str, Any]) -> str: parts.append(f"hottest {hottest.get('label')} ({hot_val_str})") if not parts: return "" - return "Postgres: " + ", ".join(parts) + "." + return "Postgres is at " + ", ".join(parts) + "." def _hottest_summary_line(metrics: dict[str, Any]) -> str: @@ -1504,7 +1536,101 @@ def _hottest_summary_line(metrics: dict[str, Any]) -> str: parts.append(f"{key.upper()} {node} ({value_fmt})") if not parts: return "" - return "Hottest nodes: " + "; ".join(parts) + "." + return "Hot spots: " + "; ".join(parts) + "." + + +def _is_insight_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + if any(word in q for word in _INSIGHT_HINT_WORDS): + return True + if "most" in q and any(word in q for word in ("unusual", "odd", "weird", "unconventional")): + return True + return False + + +def _is_overview_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any(word in q for word in _OVERVIEW_HINT_WORDS) + + +def _doc_intent(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any( + phrase in q + for phrase in ( + "runbook", + "documentation", + "docs", + "guide", + "how do i", + "how to", + "instructions", + "playbook", + ) + ) + + +def _insight_candidates( + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, +) -> list[tuple[str, str, str]]: + metrics = _snapshot_metrics(snapshot) + candidates: list[tuple[str, str, str]] = [] + + nodes_line = _nodes_summary_line(inventory, snapshot) + if nodes_line and "not ready" in nodes_line.lower(): + candidates.append(("availability", nodes_line, "high")) + + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if hottest: + cpu = hottest.get("cpu") if isinstance(hottest.get("cpu"), dict) else {} + if cpu.get("node") and cpu.get("value") is not None: + value_fmt = _format_metric_value(str(cpu.get("value")), percent=True) + candidates.append(("cpu", f"The busiest CPU right now is {cpu.get('node')} at about {value_fmt}.", "high")) + ram = hottest.get("ram") if isinstance(hottest.get("ram"), dict) else {} + if ram.get("node") and ram.get("value") is not None: + value_fmt = _format_metric_value(str(ram.get("value")), percent=True) + candidates.append(("ram", f"RAM usage peaks on {ram.get('node')} at about {value_fmt}.", "high")) + + postgres_line = _postgres_summary_line(metrics) + if postgres_line: + candidates.append(("postgres", postgres_line, "high")) + + hardware_line = _hardware_mix_line(inventory) + if hardware_line: + candidates.append(("hardware", hardware_line, "medium")) + + pods_line = _pods_summary_line(metrics) + if pods_line: + candidates.append(("pods", pods_line, "high")) + + return candidates + + +def _select_insight( + prompt: str, + candidates: list[tuple[str, str, str]], +) -> tuple[str, str] | None: + if not candidates: + return None + q = normalize_query(prompt) + prefer_keys: list[str] = [] + if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): + prefer_keys.extend(["hardware", "availability"]) + if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: + return candidates[1][1], candidates[1][2] + if prefer_keys: + for key, text, conf in candidates: + if key in prefer_keys: + return text, conf + key, text, conf = candidates[0] + return text, conf def cluster_overview_answer( @@ -1517,31 +1643,21 @@ def cluster_overview_answer( return "" q = normalize_query(prompt) metrics = _snapshot_metrics(snapshot) - lines: list[str] = [] + sentences: list[str] = [] nodes_line = _nodes_summary_line(inventory, snapshot) if nodes_line: - lines.append(nodes_line) + sentences.append(nodes_line) - if any(word in q for word in ("hardware", "architecture", "nodes", "node", "cluster", "atlas", "titan", "lab")): - hw_line = _hardware_mix_line(inventory) - if hw_line: - lines.append(hw_line) - os_line = _os_mix_line(snapshot) - if os_line: - lines.append(os_line) - - if any( + wants_overview = _is_overview_query(q) or any(word in q for word in ("atlas", "cluster", "titan", "lab")) + wants_hardware = any(word in q for word in ("hardware", "architecture", "nodes", "node")) or wants_overview + wants_metrics = any( word in q for word in ( - "interesting", "status", "health", "overview", "summary", - "tell me", - "what do you know", - "about", "pods", "postgres", "connections", @@ -1558,20 +1674,32 @@ def cluster_overview_answer( "usage", "utilization", ) - ): + ) or wants_overview + + if wants_hardware: + hw_line = _hardware_mix_line(inventory) + if hw_line: + sentences.append(hw_line) + os_line = _os_mix_line(snapshot) + if os_line: + sentences.append(os_line) + + if wants_metrics: pods_line = _pods_summary_line(metrics) if pods_line: - lines.append(pods_line) - hottest_line = _hottest_summary_line(metrics) - if hottest_line: - lines.append(hottest_line) + sentences.append(pods_line) postgres_line = _postgres_summary_line(metrics) if postgres_line: - lines.append(postgres_line) + sentences.append(postgres_line) + hottest_line = _hottest_summary_line(metrics) + if hottest_line: + sentences.append(hottest_line) - if not lines: + if not sentences: return "" - return "Based on the snapshot, " + "\n".join(lines) + if len(sentences) > 3 and not wants_overview: + sentences = sentences[:3] + return "Based on the latest snapshot, " + " ".join(sentences) def cluster_answer( @@ -1582,6 +1710,12 @@ def cluster_answer( workloads: list[dict[str, Any]] | None, ) -> str: metrics_summary = snapshot_context(prompt, snapshot) + if _is_insight_query(prompt): + candidates = _insight_candidates(inventory, snapshot) + selected = _select_insight(prompt, candidates) + if selected: + text, confidence = selected + return _format_confidence(text, confidence) structured = structured_answer( prompt, inventory=inventory, @@ -1602,7 +1736,7 @@ def cluster_answer( overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot) if overview: - kb_titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else "" + kb_titles = kb_retrieve_titles(prompt, limit=4) if _doc_intent(prompt) else "" if kb_titles: overview = overview + "\n" + kb_titles return _format_confidence(overview, "medium") From 59979a48e52a8553f16e0e8385e98206105c766e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:10:30 -0300 Subject: [PATCH 335/416] comms: roll atlasbot after bot updates --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 301a4746..817e9361 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-51 + checksum/atlasbot-configmap: manual-atlasbot-52 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From a39440e8721d07231e116bb01fe6bf6f575804dd Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 21:11:24 +0000 Subject: [PATCH 336/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index bb9e5f09..68eea2cb 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -22,7 +22,7 @@ images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} + newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home From 7d94896bafa96303df61039b01272b4508ae87af Mon Sep 17 00:00:00 2001 From: flux-bot Date: Tue, 27 Jan 2026 21:11:27 +0000 Subject: [PATCH 337/416] chore(bstein-dev-home): automated image update --- services/bstein-dev-home/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index 68eea2cb..a8132417 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -20,7 +20,7 @@ resources: - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} + newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: From e05b627c71a97b8ec96c3bf50505758631ede33c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:17:29 -0300 Subject: [PATCH 338/416] atlasbot: add narrative insights --- services/comms/scripts/atlasbot/bot.py | 50 ++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 0dcfc606..ada8dd7f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1616,7 +1616,7 @@ def _insight_candidates( def _select_insight( prompt: str, candidates: list[tuple[str, str, str]], -) -> tuple[str, str] | None: +) -> tuple[str, str, str] | None: if not candidates: return None q = normalize_query(prompt) @@ -1624,13 +1624,43 @@ def _select_insight( if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): prefer_keys.extend(["hardware", "availability"]) if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: - return candidates[1][1], candidates[1][2] + return candidates[1] if prefer_keys: for key, text, conf in candidates: if key in prefer_keys: - return text, conf - key, text, conf = candidates[0] - return text, conf + return key, text, conf + return candidates[0] + + +def _format_insight_text(key: str, text: str) -> str: + cleaned = text.strip().rstrip(".") + if not cleaned: + return "" + if key == "hardware": + counts = cleaned.replace("Hardware mix includes ", "") + return f"Atlas mixes Raspberry Pi, Jetson, and AMD64 nodes ({counts})." + if key == "postgres": + detail = cleaned.replace("Postgres is at ", "") + return f"Postgres looks healthy at {detail}." + if key == "pods": + detail = cleaned.replace("There are ", "") + return f"Pods look stable with {detail}." + if key == "availability": + return cleaned + "." + if key in ("cpu", "ram"): + return cleaned + "." + return cleaned + "." + + +def _insight_prefix(prompt: str) -> str: + q = normalize_query(prompt) + if any(word in q for word in ("another", "else", "different", "other")): + return "Another interesting detail: " + if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): + return "What stands out is that " + if any(word in q for word in ("interesting", "notable", "fun", "cool")): + return "One notable detail: " + return "" def cluster_overview_answer( @@ -1714,8 +1744,14 @@ def cluster_answer( candidates = _insight_candidates(inventory, snapshot) selected = _select_insight(prompt, candidates) if selected: - text, confidence = selected - return _format_confidence(text, confidence) + key, raw_text, confidence = selected + formatted = _format_insight_text(key, raw_text) + if not formatted: + formatted = raw_text + prefix = _insight_prefix(prompt) + if prefix: + formatted = prefix + formatted + return _format_confidence(formatted, confidence) structured = structured_answer( prompt, inventory=inventory, From 4589c65c2ba5ab57721998b7f8ece2fec9a0121d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:18:06 -0300 Subject: [PATCH 339/416] comms: roll atlasbot for insight updates --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 817e9361..31e37332 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-52 + checksum/atlasbot-configmap: manual-atlasbot-53 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 66a42aaa9323f8bb87c75428d6a1827b8c999a17 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:32:27 -0300 Subject: [PATCH 340/416] atlasbot: use history for subjective follow-ups --- services/comms/scripts/atlasbot/bot.py | 94 ++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 5 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index ada8dd7f..a446a10e 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -191,6 +191,10 @@ _INSIGHT_HINT_WORDS = { "cool", "unique", "notable", + "coolest", + "favorite", + "favourite", + "trivia", } _OVERVIEW_HINT_WORDS = { @@ -1550,6 +1554,21 @@ def _is_insight_query(query: str) -> bool: return False +def _is_subjective_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any(word in q for word in _INSIGHT_HINT_WORDS) or any( + phrase in q + for phrase in ( + "what do you think", + "your favorite", + "your favourite", + "your opinion", + ) + ) + + def _is_overview_query(query: str) -> bool: q = normalize_query(query) if not q: @@ -1602,9 +1621,9 @@ def _insight_candidates( if postgres_line: candidates.append(("postgres", postgres_line, "high")) - hardware_line = _hardware_mix_line(inventory) - if hardware_line: - candidates.append(("hardware", hardware_line, "medium")) + hardware_insight = _hardware_insight(inventory) + if hardware_insight: + candidates.append(("hardware", hardware_insight, "medium")) pods_line = _pods_summary_line(metrics) if pods_line: @@ -1613,6 +1632,29 @@ def _insight_candidates( return candidates +def _hardware_insight(inventory: list[dict[str, Any]]) -> str: + if not inventory: + return "" + groups = _group_nodes(inventory) + jetsons = groups.get("jetson") or [] + rpi5 = groups.get("rpi5") or [] + rpi4 = groups.get("rpi4") or [] + amd64 = groups.get("amd64") or [] + if jetsons: + jetson_names = ", ".join(jetsons[:2]) + return ( + f"Atlas mixes tiny Raspberry Pi nodes with Jetson accelerators ({jetson_names}) " + f"and AMD64 servers, which is unusual for a homelab cluster." + ) + if amd64 and (rpi5 or rpi4): + return ( + "Atlas mixes small ARM boards with a couple of AMD64 machines, " + "so workloads can land on either low-power or high-power nodes." + ) + line = _hardware_mix_line(inventory) + return line.replace("Hardware mix includes ", "Atlas mixes ") if line else "" + + def _select_insight( prompt: str, candidates: list[tuple[str, str, str]], @@ -1623,6 +1665,8 @@ def _select_insight( prefer_keys: list[str] = [] if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): prefer_keys.extend(["hardware", "availability"]) + if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")): + prefer_keys.extend(["hardware", "cpu", "ram"]) if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: return candidates[1] if prefer_keys: @@ -2284,7 +2328,24 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) - cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + history_payload = payload.get("history") or [] + history_lines: list[str] = [] + if isinstance(history_payload, list): + for item in history_payload[-10:]: + if isinstance(item, dict): + content = item.get("content") or item.get("message") or "" + if isinstance(content, str) and content.strip(): + history_lines.append(content.strip()) + elif isinstance(item, str) and item.strip(): + history_lines.append(item.strip()) + history_cluster = _history_mentions_cluster( + history_lines, + inventory=inventory, + workloads=workloads, + ) + cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) or ( + _is_subjective_query(cleaned) and history_cluster + ) context = "" if cluster_query: context = build_context( @@ -2329,6 +2390,22 @@ history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] ( def key_for(room_id: str, sender: str, is_dm: bool): return (room_id, None) if is_dm else (room_id, sender) + +def _history_mentions_cluster( + history_lines: list[str], + *, + inventory: list[dict[str, Any]] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> bool: + recent = [line for line in history_lines[-8:] if isinstance(line, str)] + for line in recent: + cleaned = normalize_query(line) + if not cleaned: + continue + if _is_cluster_query(cleaned, inventory=inventory, workloads=workloads): + return True + return False + def build_context( prompt: str, *, @@ -2734,7 +2811,14 @@ def sync_loop(token: str, room_id: str): if not inventory: inventory = _snapshot_inventory(snapshot) workloads = _snapshot_workloads(snapshot) - cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + history_cluster = _history_mentions_cluster( + history[hist_key], + inventory=inventory, + workloads=workloads, + ) + cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) or ( + _is_subjective_query(cleaned_body) and history_cluster + ) context = "" if cluster_query: context = build_context( From 9dbea9dd0b99be379f97bdfb2211308ecc1f5048 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:32:54 -0300 Subject: [PATCH 341/416] comms: roll atlasbot after history update --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 31e37332..03e9dc23 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-53 + checksum/atlasbot-configmap: manual-atlasbot-54 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 9ef1cdc7a94ab0a8b7d2c9dfcf85d1d37fc4fb7f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:43:03 -0300 Subject: [PATCH 342/416] atlasbot: improve insight voice and avoid repeats --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 86 ++++++++++++++++++++----- 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 03e9dc23..dc1b0bbe 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-54 + checksum/atlasbot-configmap: manual-atlasbot-55 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index a446a10e..2616cb1b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1640,27 +1640,49 @@ def _hardware_insight(inventory: list[dict[str, Any]]) -> str: rpi5 = groups.get("rpi5") or [] rpi4 = groups.get("rpi4") or [] amd64 = groups.get("amd64") or [] + parts: list[str] = [] + if rpi5: + parts.append(f"rpi5={len(rpi5)}") + if rpi4: + parts.append(f"rpi4={len(rpi4)}") if jetsons: jetson_names = ", ".join(jetsons[:2]) - return ( - f"Atlas mixes tiny Raspberry Pi nodes with Jetson accelerators ({jetson_names}) " - f"and AMD64 servers, which is unusual for a homelab cluster." - ) - if amd64 and (rpi5 or rpi4): - return ( - "Atlas mixes small ARM boards with a couple of AMD64 machines, " - "so workloads can land on either low-power or high-power nodes." - ) - line = _hardware_mix_line(inventory) - return line.replace("Hardware mix includes ", "Atlas mixes ") if line else "" + parts.append(f"jetson={len(jetsons)} ({jetson_names})") + if amd64: + parts.append(f"amd64={len(amd64)}") + return ", ".join(parts) + + +def _recent_insight_keys(history_lines: list[str]) -> set[str]: + used: set[str] = set() + for line in history_lines[-10:]: + lower = normalize_query(line) + if not lower: + continue + if "postgres" in lower or "connections" in lower: + used.add("postgres") + if "atlas mixes" in lower or "hardware" in lower or "rpi" in lower or "jetson" in lower: + used.add("hardware") + if "busiest cpu" in lower or "cpu right now" in lower or "cpu " in lower: + used.add("cpu") + if "ram usage" in lower or "memory" in lower: + used.add("ram") + if "pods" in lower: + used.add("pods") + if "not ready" in lower: + used.add("availability") + return used def _select_insight( prompt: str, candidates: list[tuple[str, str, str]], + *, + used_keys: set[str] | None = None, ) -> tuple[str, str, str] | None: if not candidates: return None + used = used_keys or set() q = normalize_query(prompt) prefer_keys: list[str] = [] if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): @@ -1668,11 +1690,21 @@ def _select_insight( if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")): prefer_keys.extend(["hardware", "cpu", "ram"]) if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: + for candidate in candidates: + if candidate[0] not in used: + return candidate return candidates[1] if prefer_keys: + for key, text, conf in candidates: + if key in prefer_keys and key not in used: + return key, text, conf for key, text, conf in candidates: if key in prefer_keys: return key, text, conf + if used: + for candidate in candidates: + if candidate[0] not in used: + return candidate return candidates[0] @@ -1681,29 +1713,45 @@ def _format_insight_text(key: str, text: str) -> str: if not cleaned: return "" if key == "hardware": - counts = cleaned.replace("Hardware mix includes ", "") - return f"Atlas mixes Raspberry Pi, Jetson, and AMD64 nodes ({counts})." + counts = ( + cleaned.replace("Hardware mix includes ", "") + .replace("Atlas mixes tiny ", "") + .replace("Atlas mixes ", "") + .replace("which is unusual for a homelab cluster", "") + .strip() + .strip(".") + ) + return f"the mixed hardware stack ({counts}) is a bit unconventional for a homelab." if key == "postgres": detail = cleaned.replace("Postgres is at ", "") - return f"Postgres looks healthy at {detail}." + return f"Postgres looks healthy at {detail}; that suggests moderate load." if key == "pods": detail = cleaned.replace("There are ", "") return f"Pods look stable with {detail}." if key == "availability": return cleaned + "." if key in ("cpu", "ram"): - return cleaned + "." + suffix = " That likely marks the busiest workload right now." if key == "cpu" else " That box is carrying the heaviest memory load." + return cleaned + "." + suffix return cleaned + "." def _insight_prefix(prompt: str) -> str: q = normalize_query(prompt) + if "coolest" in q: + return "If I had to pick the coolest detail, it's " + if "favorite" in q or "favourite" in q: + return "My favorite detail is " + if "trivia" in q: + return "A bit of trivia I like: " + if "most interesting" in q: + return "The most interesting detail to me is " if any(word in q for word in ("another", "else", "different", "other")): return "Another interesting detail: " if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): return "What stands out is that " if any(word in q for word in ("interesting", "notable", "fun", "cool")): - return "One notable detail: " + return "One thing I'd highlight is " return "" @@ -1782,11 +1830,13 @@ def cluster_answer( inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None, workloads: list[dict[str, Any]] | None, + history_lines: list[str] | None = None, ) -> str: metrics_summary = snapshot_context(prompt, snapshot) if _is_insight_query(prompt): candidates = _insight_candidates(inventory, snapshot) - selected = _select_insight(prompt, candidates) + used_keys = _recent_insight_keys(history_lines or []) + selected = _select_insight(prompt, candidates, used_keys=used_keys) if selected: key, raw_text, confidence = selected formatted = _format_insight_text(key, raw_text) @@ -2363,6 +2413,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory=inventory, snapshot=snapshot, workloads=workloads, + history_lines=history_lines, ) if not answer: answer = fallback @@ -2843,6 +2894,7 @@ def sync_loop(token: str, room_id: str): inventory=inventory, snapshot=snapshot, workloads=workloads, + history_lines=history[hist_key], ) if not reply: reply = fallback From 577e2a158d394ae1c3d60ad3426f00fdc86720aa Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:44:58 -0300 Subject: [PATCH 343/416] monitoring: keep idle label in gpu share --- scripts/dashboards_render_atlas.py | 2 +- services/monitoring/dashboards/atlas-gpu.json | 2 +- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/grafana-dashboard-gpu.yaml | 2 +- services/monitoring/grafana-dashboard-overview.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 34ded89e..445de94b 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -266,7 +266,7 @@ def namespace_gpu_share_expr(scope_var): usage = namespace_gpu_usage_instant(scope_var) total = f"(sum({usage}) or on() vector(0))" share = f"100 * ({usage}) / clamp_min({total}, 1)" - idle = 'label_replace(vector(100), "namespace", "idle", "", "") * on() (' + total + " == bool 0)" + idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)" return f"({share}) or ({idle})" diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index f6801aa6..132f2766 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 1a507ece..b212c8cd 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index dc1025b6..55f63e84 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ed63da05..a8990024 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } From ff5cfd27a1837f462c7e78d5abdaa71041387052 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:45:49 -0300 Subject: [PATCH 344/416] atlasbot: tighten insight phrasing --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index dc1b0bbe..4a3949da 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-55 + checksum/atlasbot-configmap: manual-atlasbot-56 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 2616cb1b..9beff7f6 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1721,17 +1721,21 @@ def _format_insight_text(key: str, text: str) -> str: .strip() .strip(".") ) - return f"the mixed hardware stack ({counts}) is a bit unconventional for a homelab." + return f"mixed hardware stack ({counts}), which is unusual for a homelab." if key == "postgres": detail = cleaned.replace("Postgres is at ", "") - return f"Postgres looks healthy at {detail}; that suggests moderate load." + return f"Postgres is at {detail}; that suggests moderate load." if key == "pods": detail = cleaned.replace("There are ", "") return f"Pods look stable with {detail}." if key == "availability": return cleaned + "." if key in ("cpu", "ram"): - suffix = " That likely marks the busiest workload right now." if key == "cpu" else " That box is carrying the heaviest memory load." + suffix = ( + " That likely marks the busiest workload right now." + if key == "cpu" + else " That box is carrying the heaviest memory load." + ) return cleaned + "." + suffix return cleaned + "." @@ -1739,19 +1743,19 @@ def _format_insight_text(key: str, text: str) -> str: def _insight_prefix(prompt: str) -> str: q = normalize_query(prompt) if "coolest" in q: - return "If I had to pick the coolest detail, it's " + return "If I had to pick the coolest detail: " if "favorite" in q or "favourite" in q: - return "My favorite detail is " + return "My favorite detail: " if "trivia" in q: return "A bit of trivia I like: " if "most interesting" in q: - return "The most interesting detail to me is " + return "The most interesting detail to me: " if any(word in q for word in ("another", "else", "different", "other")): return "Another interesting detail: " if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): return "What stands out is that " if any(word in q for word in ("interesting", "notable", "fun", "cool")): - return "One thing I'd highlight is " + return "One thing I'd highlight: " return "" From c02973e5a6aecf63a70051a6324182761b11b61d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:48:35 -0300 Subject: [PATCH 345/416] atlasbot: add more opinionated hardware insight --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4a3949da..d02255e3 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-56 + checksum/atlasbot-configmap: manual-atlasbot-57 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9beff7f6..54434e71 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1721,7 +1721,9 @@ def _format_insight_text(key: str, text: str) -> str: .strip() .strip(".") ) - return f"mixed hardware stack ({counts}), which is unusual for a homelab." + detail = f"mixed hardware stack ({counts})" + flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 nodes." + return f"{detail}. {flavor}" if key == "postgres": detail = cleaned.replace("Postgres is at ", "") return f"Postgres is at {detail}; that suggests moderate load." @@ -1732,9 +1734,9 @@ def _format_insight_text(key: str, text: str) -> str: return cleaned + "." if key in ("cpu", "ram"): suffix = ( - " That likely marks the busiest workload right now." + " If you're chasing hotspots, that's the busiest workload right now." if key == "cpu" - else " That box is carrying the heaviest memory load." + else " That box is carrying the heaviest memory load right now." ) return cleaned + "." + suffix return cleaned + "." From 5553871d3351f81b7d27d65f63a598aa75eb1eb9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:51:00 -0300 Subject: [PATCH 346/416] atlasbot: make insights sound more human --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d02255e3..2c0b84d6 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-57 + checksum/atlasbot-configmap: manual-atlasbot-58 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 54434e71..659ea495 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1729,9 +1729,9 @@ def _format_insight_text(key: str, text: str) -> str: return f"Postgres is at {detail}; that suggests moderate load." if key == "pods": detail = cleaned.replace("There are ", "") - return f"Pods look stable with {detail}." + return f"Pods look steady ({detail}); the workload mix looks healthy." if key == "availability": - return cleaned + "." + return cleaned + " That suggests the cluster is stable right now." if key in ("cpu", "ram"): suffix = ( " If you're chasing hotspots, that's the busiest workload right now." From f175273a6cf947fc112c43020f5a2c096e2c9fb2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:54:05 -0300 Subject: [PATCH 347/416] atlasbot: use hottest node labels for insights --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 2c0b84d6..1212505c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-58 + checksum/atlasbot-configmap: manual-atlasbot-59 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 659ea495..7f92d8ec 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1608,14 +1608,26 @@ def _insight_candidates( hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} if hottest: + def _hot_node(entry: dict[str, Any]) -> str: + if not isinstance(entry, dict): + return "" + return ( + entry.get("node") + or entry.get("label") + or (entry.get("metric") or {}).get("node") + or "" + ) + cpu = hottest.get("cpu") if isinstance(hottest.get("cpu"), dict) else {} - if cpu.get("node") and cpu.get("value") is not None: + cpu_node = _hot_node(cpu) + if cpu_node and cpu.get("value") is not None: value_fmt = _format_metric_value(str(cpu.get("value")), percent=True) - candidates.append(("cpu", f"The busiest CPU right now is {cpu.get('node')} at about {value_fmt}.", "high")) + candidates.append(("cpu", f"The busiest CPU right now is {cpu_node} at about {value_fmt}.", "high")) ram = hottest.get("ram") if isinstance(hottest.get("ram"), dict) else {} - if ram.get("node") and ram.get("value") is not None: + ram_node = _hot_node(ram) + if ram_node and ram.get("value") is not None: value_fmt = _format_metric_value(str(ram.get("value")), percent=True) - candidates.append(("ram", f"RAM usage peaks on {ram.get('node')} at about {value_fmt}.", "high")) + candidates.append(("ram", f"RAM usage peaks on {ram_node} at about {value_fmt}.", "high")) postgres_line = _postgres_summary_line(metrics) if postgres_line: From 39fb7e5eb47bb3be71cc78124df981f9ac6c3932 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:56:14 -0300 Subject: [PATCH 348/416] atlasbot: prioritize hardware for subjective prompts --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 1212505c..cbc79e54 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-59 + checksum/atlasbot-configmap: manual-atlasbot-60 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7f92d8ec..613b0c6f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1707,12 +1707,14 @@ def _select_insight( return candidate return candidates[1] if prefer_keys: - for key, text, conf in candidates: - if key in prefer_keys and key not in used: - return key, text, conf - for key, text, conf in candidates: - if key in prefer_keys: - return key, text, conf + for prefer in prefer_keys: + for key, text, conf in candidates: + if key == prefer and key not in used: + return key, text, conf + for prefer in prefer_keys: + for key, text, conf in candidates: + if key == prefer: + return key, text, conf if used: for candidate in candidates: if candidate[0] not in used: From d35cb0c6c39e9e6cb1489556a678b329169afcff Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 18:58:59 -0300 Subject: [PATCH 349/416] atlasbot: keep coolest answers opinionated --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index cbc79e54..ef6b88b6 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-60 + checksum/atlasbot-configmap: manual-atlasbot-61 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 613b0c6f..9434e913 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1701,6 +1701,7 @@ def _select_insight( prefer_keys.extend(["hardware", "availability"]) if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")): prefer_keys.extend(["hardware", "cpu", "ram"]) + avoid_used = any(word in q for word in ("another", "else", "different", "other")) or "most interesting" in q if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: for candidate in candidates: if candidate[0] not in used: @@ -1709,13 +1710,13 @@ def _select_insight( if prefer_keys: for prefer in prefer_keys: for key, text, conf in candidates: - if key == prefer and key not in used: + if key == prefer and (not avoid_used or key not in used): return key, text, conf for prefer in prefer_keys: for key, text, conf in candidates: if key == prefer: return key, text, conf - if used: + if used and avoid_used: for candidate in candidates: if candidate[0] not in used: return candidate From 61de9d400b7cf4f828b22bd64a3db9c95838b03e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:01:16 -0300 Subject: [PATCH 350/416] atlasbot: prefer hardware for general interest --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index ef6b88b6..e8e22a36 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-61 + checksum/atlasbot-configmap: manual-atlasbot-62 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9434e913..f9e6b818 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1701,6 +1701,8 @@ def _select_insight( prefer_keys.extend(["hardware", "availability"]) if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")): prefer_keys.extend(["hardware", "cpu", "ram"]) + if "interesting" in q and "most interesting" not in q: + prefer_keys.extend(["hardware", "postgres", "cpu", "ram"]) avoid_used = any(word in q for word in ("another", "else", "different", "other")) or "most interesting" in q if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: for candidate in candidates: From 9361d003ff5cee8b77bac5a270fcc9cb024c876a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:04:29 -0300 Subject: [PATCH 351/416] atlasbot: treat hardware prompts as cluster queries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index e8e22a36..36bb1dbf 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-62 + checksum/atlasbot-configmap: manual-atlasbot-63 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f9e6b818..4ca3b2ee 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -139,6 +139,8 @@ CLUSTER_HINT_WORDS = { "kubernetes", "node", "nodes", + "hardware", + "architecture", "worker", "workers", "pod", From c5b24119d301361f1eba4f271fd2bb652d3be0e0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:06:44 -0300 Subject: [PATCH 352/416] atlasbot: answer hardware mix queries --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 36bb1dbf..9cc0a1e9 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-63 + checksum/atlasbot-configmap: manual-atlasbot-64 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 4ca3b2ee..570bc26f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1292,6 +1292,11 @@ def structured_answer( if not op and entity == "node": op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" + if entity == "node" and ("hardware mix" in q or "architecture" in q): + hw_line = _hardware_mix_line(inventory) + if hw_line: + return _format_confidence(hw_line, "high") + if op == "top" and metric is None: metric = "cpu" From d721368f5109e7ddc4e7eb27d5b0246251508325 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:10:02 -0300 Subject: [PATCH 353/416] atlasbot: expand hardware and entity detection --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 9cc0a1e9..72503b81 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-64 + checksum/atlasbot-configmap: manual-atlasbot-65 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 570bc26f..2b3657a9 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -197,6 +197,8 @@ _INSIGHT_HINT_WORDS = { "favorite", "favourite", "trivia", + "stand out", + "stands out", } _OVERVIEW_HINT_WORDS = { @@ -213,8 +215,8 @@ _OLLAMA_LOCK = threading.Lock() HARDWARE_HINTS = { "amd64": ("amd64", "x86", "x86_64", "x86-64"), "jetson": ("jetson",), - "rpi4": ("rpi4",), - "rpi5": ("rpi5",), + "rpi4": ("rpi4", "raspberry pi 4", "raspberry pi-4"), + "rpi5": ("rpi5", "raspberry pi 5", "raspberry pi-5"), "rpi": ("rpi", "raspberry"), "arm64": ("arm64", "aarch64"), } @@ -559,7 +561,16 @@ def _detect_role_filters(q: str) -> set[str]: return roles def _detect_entity(q: str) -> str | None: - if "node" in q or "nodes" in q or "worker" in q or "hardware" in q or "architecture" in q or TITAN_NODE_RE.search(q): + if ( + "node" in q + or "nodes" in q + or "worker" in q + or "hardware" in q + or "architecture" in q + or "machine" in q + or "machines" in q + or TITAN_NODE_RE.search(q) + ): return "node" if "pod" in q or "pods" in q: return "pod" From a3fdf20e39d7c82cf03fe5ab482c3e37c043113c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:13:31 -0300 Subject: [PATCH 354/416] atlasbot: refine node and postgres query handling --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 72503b81..e1ff2bb4 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-65 + checksum/atlasbot-configmap: manual-atlasbot-66 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 2b3657a9..abdcbf25 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -538,7 +538,17 @@ def _detect_metric(q: str) -> str | None: def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: include: set[str] = set() exclude: set[str] = set() - rpi_specific = "rpi4" in q or "rpi5" in q + rpi_specific = any( + phrase in q + for phrase in ( + "rpi4", + "rpi5", + "raspberry pi 4", + "raspberry pi 5", + "raspberry pi-4", + "raspberry pi-5", + ) + ) for hardware, phrases in HARDWARE_HINTS.items(): if hardware == "rpi" and rpi_specific: continue @@ -1226,7 +1236,11 @@ def snapshot_metric_answer( hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} parts: list[str] = [] if used is not None and max_conn is not None: - parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") + free = max_conn - used + if any(word in q for word in ("free", "available", "remaining")): + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max ({free:.0f} free).") + else: + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") if hottest.get("label"): hot_val = hottest.get("value") hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" @@ -1303,6 +1317,11 @@ def structured_answer( if not op and entity == "node": op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" + if entity == "node" and "total" in q and "ready" in q: + summary = _nodes_summary_line(inventory, snapshot) + if summary: + return _format_confidence(summary, "high") + if entity == "node" and ("hardware mix" in q or "architecture" in q): hw_line = _hardware_mix_line(inventory) if hw_line: From 3a131fa1fc1b922601d9d48b203f5f1c9cd2022e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:37:20 -0300 Subject: [PATCH 355/416] atlasbot: strengthen subjective insights --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 58 +++++++++++++++++-------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index e1ff2bb4..4ac3582d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-66 + checksum/atlasbot-configmap: manual-atlasbot-67 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index abdcbf25..0d0f92be 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -579,6 +579,10 @@ def _detect_entity(q: str) -> str | None: or "architecture" in q or "machine" in q or "machines" in q + or "host" in q + or "hosts" in q + or "hostname" in q + or "hostnames" in q or TITAN_NODE_RE.search(q) ): return "node" @@ -1775,20 +1779,29 @@ def _format_insight_text(key: str, text: str) -> str: .strip() .strip(".") ) + has_jetson = "jetson=" in counts + has_amd64 = "amd64=" in counts detail = f"mixed hardware stack ({counts})" - flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 nodes." + if has_jetson and has_amd64: + flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 boxes." + elif has_jetson: + flavor = "It pairs low-power Pis with Jetson accelerators for edge and AI workloads." + elif has_amd64: + flavor = "It mixes low-power Pis with a couple of heavier AMD64 nodes." + else: + flavor = "It is a pretty uniform hardware stack, which is rare for a homelab." return f"{detail}. {flavor}" if key == "postgres": detail = cleaned.replace("Postgres is at ", "") - return f"Postgres is at {detail}; that suggests moderate load." + return f"Postgres is at {detail}; that feels like healthy, steady load rather than strain." if key == "pods": detail = cleaned.replace("There are ", "") - return f"Pods look steady ({detail}); the workload mix looks healthy." + return f"Pods look steady ({detail}); nothing looks stuck or unhealthy." if key == "availability": - return cleaned + " That suggests the cluster is stable right now." + return cleaned + " That is the kind of stability I like to see." if key in ("cpu", "ram"): suffix = ( - " If you're chasing hotspots, that's the busiest workload right now." + " If you're chasing hotspots, that's the node I'd watch first." if key == "cpu" else " That box is carrying the heaviest memory load right now." ) @@ -1799,19 +1812,19 @@ def _format_insight_text(key: str, text: str) -> str: def _insight_prefix(prompt: str) -> str: q = normalize_query(prompt) if "coolest" in q: - return "If I had to pick the coolest detail: " + return "If I had to pick the coolest detail, I'd say " if "favorite" in q or "favourite" in q: - return "My favorite detail: " + return "My favorite detail is " if "trivia" in q: return "A bit of trivia I like: " if "most interesting" in q: - return "The most interesting detail to me: " + return "The most interesting detail to me is " if any(word in q for word in ("another", "else", "different", "other")): return "Another interesting detail: " if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): - return "What stands out is that " + return "What stands out to me is that " if any(word in q for word in ("interesting", "notable", "fun", "cool")): - return "One thing I'd highlight: " + return "One thing I'd call out is " return "" @@ -2389,6 +2402,21 @@ def _normalize_reply(value: Any) -> str: return _ensure_confidence(text) +def _history_payload_lines(history_payload: list[Any]) -> list[str]: + lines: list[str] = [] + if not isinstance(history_payload, list): + return lines + for item in history_payload[-12:]: + if isinstance(item, dict): + for key in ("content", "message", "text", "prompt", "question", "body", "answer", "reply", "response"): + val = item.get(key) + if isinstance(val, str) and val.strip(): + lines.append(val.strip()) + elif isinstance(item, str) and item.strip(): + lines.append(item.strip()) + return [line for line in lines if line] + + # Internal HTTP endpoint for cluster answers (website uses this). class _AtlasbotHandler(BaseHTTPRequestHandler): server_version = "AtlasbotHTTP/1.0" @@ -2439,15 +2467,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) history_payload = payload.get("history") or [] - history_lines: list[str] = [] - if isinstance(history_payload, list): - for item in history_payload[-10:]: - if isinstance(item, dict): - content = item.get("content") or item.get("message") or "" - if isinstance(content, str) and content.strip(): - history_lines.append(content.strip()) - elif isinstance(item, str) and item.strip(): - history_lines.append(item.strip()) + history_lines = _history_payload_lines(history_payload) history_cluster = _history_mentions_cluster( history_lines, inventory=inventory, From 52d28dcc6d407935314302a72f8d0aca06748d9e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 19:42:04 -0300 Subject: [PATCH 356/416] atlasbot: refine insight tone and status --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4ac3582d..609c2450 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-67 + checksum/atlasbot-configmap: manual-atlasbot-68 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 0d0f92be..db0f5609 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -118,7 +118,7 @@ CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECAS OPERATION_HINTS = { "count": ("how many", "count", "number", "total"), "list": ("list", "which", "what are", "show", "names"), - "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum"), + "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum", "busiest", "busy"), "status": ("ready", "not ready", "unready", "down", "missing", "status"), } @@ -1414,6 +1414,11 @@ def structured_answer( names = [node["name"] for node in filtered] if op == "status": + if "missing" in q and ("ready" in q or "readiness" in q): + return _format_confidence( + "Not ready nodes: " + (", ".join(names) if names else "none") + ".", + "high", + ) if "missing" in q and expected_workers: missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) return _format_confidence( From b7d957ecd84b51eed8dc1699980dc09b255a9cf5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 20:02:09 -0300 Subject: [PATCH 357/416] atlasbot: route subjective queries to LLM --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 60 +++++++++++++++++-------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 609c2450..d8ce3ee8 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-68 + checksum/atlasbot-configmap: manual-atlasbot-69 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index db0f5609..141b9714 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1911,19 +1911,6 @@ def cluster_answer( history_lines: list[str] | None = None, ) -> str: metrics_summary = snapshot_context(prompt, snapshot) - if _is_insight_query(prompt): - candidates = _insight_candidates(inventory, snapshot) - used_keys = _recent_insight_keys(history_lines or []) - selected = _select_insight(prompt, candidates, used_keys=used_keys) - if selected: - key, raw_text, confidence = selected - formatted = _format_insight_text(key, raw_text) - if not formatted: - formatted = raw_text - prefix = _insight_prefix(prompt) - if prefix: - formatted = prefix + formatted - return _format_confidence(formatted, confidence) structured = structured_answer( prompt, inventory=inventory, @@ -2422,6 +2409,17 @@ def _history_payload_lines(history_payload: list[Any]) -> list[str]: return [line for line in lines if line] +def _append_history_context(context: str, history_lines: list[str]) -> str: + lines = [line.strip() for line in history_lines if isinstance(line, str) and line.strip()] + if not lines: + return context + snippet = "\n".join(lines[-6:]) + combined = context + "\nRecent chat:\n" + snippet if context else "Recent chat:\n" + snippet + if len(combined) > MAX_CONTEXT_CHARS: + combined = combined[: MAX_CONTEXT_CHARS - 3].rstrip() + "..." + return combined + + # Internal HTTP endpoint for cluster answers (website uses this). class _AtlasbotHandler(BaseHTTPRequestHandler): server_version = "AtlasbotHTTP/1.0" @@ -2493,15 +2491,25 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): ) fallback = "I don't have enough data to answer that." if cluster_query: - answer = cluster_answer( + facts_answer = cluster_answer( cleaned, inventory=inventory, snapshot=snapshot, workloads=workloads, history_lines=history_lines, ) - if not answer: - answer = fallback + open_ended = _is_subjective_query(cleaned) or _knowledge_intent(cleaned) + if open_ended: + llm_context = _append_history_context(context, history_lines) + answer = ollama_reply( + ("http", "internal"), + cleaned, + context=llm_context, + fallback=facts_answer or fallback, + use_history=False, + ) + else: + answer = facts_answer or fallback else: llm_prompt = cleaned answer = ollama_reply( @@ -2761,11 +2769,13 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru "When a cluster snapshot is provided, never answer about unrelated meanings of 'Atlas' (maps, mythology, Apache Atlas, etc). " "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " + "For subjective prompts (interesting, favorite, unconventional), pick one or two observations from the context, explain why they stand out in 1-2 sentences, and avoid repeating the same observation as the last response if you can. " "Prefer exact repo paths and Kubernetes resource names when relevant. " "Never include or request secret values. " "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " "Translate metrics into natural language instead of echoing raw label/value pairs. " + "Avoid bare lists unless the user asked for a list; weave numbers into sentences. " "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. " "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. " "If the answer is not grounded in the provided context or tool data, say you do not know. " @@ -2974,15 +2984,27 @@ def sync_loop(token: str, room_id: str): fallback = "I don't have enough data to answer that." if cluster_query: - reply = cluster_answer( + facts_answer = cluster_answer( cleaned_body, inventory=inventory, snapshot=snapshot, workloads=workloads, history_lines=history[hist_key], ) - if not reply: - reply = fallback + open_ended = _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body) + if open_ended: + llm_context = _append_history_context(context, history[hist_key]) + reply = ollama_reply_with_thinking( + token, + rid, + hist_key, + cleaned_body, + context=llm_context, + fallback=facts_answer or fallback, + use_history=False, + ) + else: + reply = facts_answer or fallback else: llm_prompt = cleaned_body reply = ollama_reply_with_thinking( From df56eeddb32e7a0ab503a2bf10c92da70bdcb0d6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:02:20 -0300 Subject: [PATCH 358/416] atlasbot: refine open-ended reasoning pipeline --- services/comms/atlasbot-deployment.yaml | 6 +- services/comms/scripts/atlasbot/bot.py | 446 +++++++++++++++++++++--- 2 files changed, 401 insertions(+), 51 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d8ce3ee8..cc628dd9 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-69 + checksum/atlasbot-configmap: manual-atlasbot-70 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -78,11 +78,11 @@ spec: - name: BOT_USER value: atlasbot - name: BOT_MENTIONS - value: atlasbot,aatlasbot + value: atlasbot,aatlasbot,atlas_quick,atlas_smart - name: OLLAMA_URL value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL - value: qwen2.5:14b-instruct-q4_0 + value: qwen2.5:14b-instruct - name: OLLAMA_TIMEOUT_SEC value: "600" - name: ATLASBOT_THINKING_INTERVAL_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 141b9714..aa7e6148 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -333,6 +333,19 @@ def _strip_bot_mention(text: str) -> str: return cleaned or text.strip() +def _detect_mode_from_body(body: str, *, default: str = "deep") -> str: + lower = normalize_query(body or "") + if "atlas_quick" in lower or "atlas-quick" in lower: + return "fast" + if "atlas_smart" in lower or "atlas-smart" in lower: + return "deep" + if lower.startswith("quick ") or lower.startswith("fast "): + return "fast" + if lower.startswith("smart ") or lower.startswith("deep "): + return "deep" + return default + + # Matrix HTTP helper. def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): url = (base or BASE) + path @@ -2420,6 +2433,300 @@ def _append_history_context(context: str, history_lines: list[str]) -> str: return combined +class ThoughtState: + def __init__(self, total_steps: int = 0): + self._lock = threading.Lock() + self.stage = "starting" + self.note = "" + self.step = 0 + self.total_steps = total_steps + + def update(self, stage: str, *, note: str = "", step: int | None = None) -> None: + with self._lock: + self.stage = stage + if note: + self.note = note + if step is not None: + self.step = step + + def status_line(self) -> str: + with self._lock: + stage = self.stage + note = self.note + step = self.step + total = self.total_steps + step_part = f"{step}/{total}" if total else str(step) if step else "" + detail = f"Stage {step_part}: {stage}".strip() + if note: + return f"Still thinking ({detail}). Latest insight: {note}" + return f"Still thinking ({detail})." + + +def _ollama_json_call(prompt: str, *, context: str, retries: int = 2) -> dict[str, Any]: + system = ( + "System: You are Atlas, a reasoning assistant. " + "Return strict JSON only (no code fences, no trailing commentary). " + "If you cannot comply, return {}. " + "Only use facts from the provided context. " + "If you make an inference, label it as 'inference' in the JSON." + ) + last_exc: Exception | None = None + for attempt in range(max(1, retries + 1)): + try: + raw = _ollama_call( + ("json", "internal"), + prompt, + context=context, + use_history=False, + system_override=system, + ) + cleaned = _strip_code_fence(raw).strip() + if cleaned.startswith("{") and cleaned.endswith("}"): + return json.loads(cleaned) + last = json.loads(_strip_code_fence(cleaned)) + if isinstance(last, dict): + return last + except Exception as exc: # noqa: BLE001 + last_exc = exc + time.sleep(min(2, 2 ** attempt)) + if last_exc: + return {} + return {} + + +def _fact_pack_lines( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> list[str]: + raw = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + lines: list[str] = [] + for line in raw.splitlines(): + trimmed = line.strip() + if not trimmed or trimmed.lower().startswith("facts"): + continue + lines.append(trimmed) + return lines + + +def _fact_pack_text(lines: list[str]) -> str: + labeled = [f"F{idx + 1}: {line}" for idx, line in enumerate(lines)] + return "Fact pack:\n" + "\n".join(labeled) + + +def _open_ended_system() -> str: + return ( + "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " + "Use ONLY the provided fact pack and recent chat as your evidence. " + "You may draw light inferences if you label them as such. " + "Write concise, human sentences, not a list. " + "If the question is subjective, share a light opinion grounded in facts. " + "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " + "Avoid repeating the exact same observation as the last response if possible. " + "Do not invent numbers or facts. " + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100)." + ) + + +def _candidate_note(candidate: dict[str, Any]) -> str: + claim = str(candidate.get("claim") or candidate.get("summary") or "") + return claim[:160] + ("…" if len(claim) > 160 else "") + + +def _ensure_scores(answer: str) -> str: + text = answer.strip() + lines = [line for line in text.splitlines() if line.strip()] + has_relevance = any(line.lower().startswith("relevance:") for line in lines) + has_satisfaction = any(line.lower().startswith("satisfaction:") for line in lines) + has_confidence = any("confidence:" in line.lower() for line in lines) + if not has_confidence: + lines.append("Confidence: medium") + if not has_relevance: + lines.append("Relevance: 70") + if not has_satisfaction: + lines.append("Satisfaction: 70") + return "\n".join(lines) + + +def _open_ended_fast( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + if state: + state.update("synthesizing", step=2) + synthesis_prompt = ( + "You are given a question and a fact pack. " + "Answer in 2-4 sentences, using only facts from the pack. " + "Pick one or two facts that best fit the question and explain why they matter. " + "If the question is subjective, add a light opinion grounded in those facts. " + "Do not list raw facts; speak naturally. " + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" + f"Question: {prompt}" + ) + context = _append_history_context(fact_pack, history_lines) + reply = _ollama_call( + ("fast", "open"), + synthesis_prompt, + context=context, + use_history=False, + system_override=_open_ended_system(), + ) + return _ensure_scores(reply) + + +def _interpret_open_question( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], +) -> dict[str, Any]: + prompt_text = ( + "Analyze the question against the fact pack. " + "Return JSON: {\"focus\":\"...\",\"preference\":\"balanced|novelty|utilization|stability|risk\"," + "\"notes\":\"...\"}. " + "Use only the fact pack." + ) + context = _append_history_context(fact_pack, history_lines) + analysis = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + if not isinstance(analysis, dict): + return {"focus": "cluster snapshot", "preference": "balanced", "notes": ""} + preference = analysis.get("preference") or "balanced" + if preference not in ("balanced", "novelty", "utilization", "stability", "risk"): + preference = "balanced" + analysis["preference"] = preference + analysis.setdefault("focus", "cluster snapshot") + analysis.setdefault("notes", "") + return analysis + + +def _select_insights( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState, +) -> list[dict[str, Any]]: + insight_prompt = ( + "From the fact pack, select 3-5 candidate insights that could answer the question. " + "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," + "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\"}]}. " + "Use only the fact pack." + ) + state.update("drafting candidates", step=2) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call(insight_prompt + f" Question: {prompt}", context=context) + insights = result.get("insights") if isinstance(result, dict) else None + if not isinstance(insights, list): + return [] + cleaned: list[dict[str, Any]] = [] + for item in insights: + if not isinstance(item, dict): + continue + if not item.get("summary") or not item.get("fact_ids"): + continue + cleaned.append(item) + state.update("drafting candidates", step=2, note=_candidate_note(item)) + return cleaned + + +def _score_insight(insight: dict[str, Any], preference: str) -> float: + relevance = insight.get("relevance") if isinstance(insight.get("relevance"), (int, float)) else 0.0 + novelty = insight.get("novelty") if isinstance(insight.get("novelty"), (int, float)) else 0.0 + if preference == "novelty": + return 0.4 * relevance + 0.6 * novelty + if preference == "utilization": + return 0.7 * relevance + 0.3 * novelty + if preference == "stability": + return 0.7 * relevance + 0.3 * novelty + if preference == "risk": + return 0.6 * relevance + 0.4 * novelty + return 0.6 * relevance + 0.4 * novelty + + +def _open_ended_deep( + prompt: str, + *, + fact_pack: str, + fact_ids: set[str], + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + state = state or ThoughtState() + if not fact_ids: + return _ensure_scores("I don't have enough data to answer that.") + state.total_steps = 6 + state.update("planning", step=1) + analysis = _interpret_open_question(prompt, fact_pack=fact_pack, history_lines=history_lines) + state.update("planning", step=1, note=str(analysis.get("focus") or "")) + + candidates = _select_insights(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state) + state.update("verifying", step=3) + filtered: list[dict[str, Any]] = [] + for cand in candidates: + cites = cand.get("fact_ids") if isinstance(cand.get("fact_ids"), list) else [] + if cites and not all(cite in fact_ids for cite in cites): + continue + filtered.append(cand) + if not filtered: + filtered = candidates + + preference = analysis.get("preference", "balanced") + ranked = sorted(filtered, key=lambda item: _score_insight(item, preference), reverse=True) + top = ranked[:2] + state.update("synthesizing", step=4) + synth_prompt = ( + "Use the question, fact pack, and selected insights to craft a concise answer. " + "Write 2-4 sentences. Explain why the selected insights stand out. " + "If the question is subjective, include a light opinion grounded in facts. " + "Avoid repeating the same observation as the last response if possible. " + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" + f"Question: {prompt}\n" + f"Interpretation: {json.dumps(analysis, ensure_ascii=False)}\n" + f"Selected: {json.dumps(top, ensure_ascii=False)}" + ) + context = _append_history_context(fact_pack, history_lines) + reply = _ollama_call( + ("deep", "open"), + synth_prompt, + context=context, + use_history=False, + system_override=_open_ended_system(), + ) + state.update("done", step=6) + return _ensure_scores(reply) + + +def open_ended_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]], + history_lines: list[str], + mode: str, + state: ThoughtState | None = None, +) -> str: + lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if not lines: + return _ensure_scores("I don't have enough data to answer that.") + fact_pack = _fact_pack_text(lines) + fact_ids = {f"F{i+1}" for i in range(len(lines))} + if mode == "fast": + return _open_ended_fast(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state) + return _open_ended_deep(prompt, fact_pack=fact_pack, fact_ids=fact_ids, history_lines=history_lines, state=state) + + +def _non_cluster_reply(prompt: str) -> str: + return _ensure_scores( + "I focus on the Atlas/Othrys cluster and don't have enough data to answer that." + ) + + # Internal HTTP endpoint for cluster answers (website uses this). class _AtlasbotHandler(BaseHTTPRequestHandler): server_version = "AtlasbotHTTP/1.0" @@ -2466,6 +2773,9 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): self._write_json(400, {"error": "missing_prompt"}) return cleaned = _strip_bot_mention(prompt) + mode = str(payload.get("mode") or "fast").lower() + if mode not in ("fast", "deep"): + mode = "fast" snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) @@ -2491,34 +2801,30 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): ) fallback = "I don't have enough data to answer that." if cluster_query: - facts_answer = cluster_answer( - cleaned, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history_lines, - ) open_ended = _is_subjective_query(cleaned) or _knowledge_intent(cleaned) if open_ended: - llm_context = _append_history_context(context, history_lines) - answer = ollama_reply( - ("http", "internal"), + answer = open_ended_answer( cleaned, - context=llm_context, - fallback=facts_answer or fallback, - use_history=False, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + state=None, ) else: - answer = facts_answer or fallback + answer = ( + cluster_answer( + cleaned, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + ) + or fallback + ) else: - llm_prompt = cleaned - answer = ollama_reply( - ("http", "internal"), - llm_prompt, - context=context, - fallback=fallback, - use_history=False, - ) + answer = _non_cluster_reply(cleaned) self._write_json(200, {"answer": answer}) @@ -2760,8 +3066,15 @@ def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: summary = "\n".join(parts).strip() return _format_confidence(summary, "medium") if summary else "" -def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = True) -> str: - system = ( +def _ollama_call( + hist_key, + prompt: str, + *, + context: str, + use_history: bool = True, + system_override: str | None = None, +) -> str: + system = system_override or ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Be helpful, direct, and concise. " "Use the provided context and facts as your source of truth. " @@ -2877,6 +3190,47 @@ def ollama_reply_with_thinking( thread.join(timeout=1) return result["reply"] or fallback or "Model backend is busy. Try again in a moment." + +def open_ended_with_thinking( + token: str, + room: str, + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]], + history_lines: list[str], + mode: str, +) -> str: + result: dict[str, str] = {"reply": ""} + done = threading.Event() + total_steps = 2 if mode == "fast" else 6 + state = ThoughtState(total_steps=total_steps) + + def worker(): + result["reply"] = open_ended_answer( + prompt, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + state=state, + ) + done.set() + + thread = threading.Thread(target=worker, daemon=True) + thread.start() + if not done.wait(2.0): + send_msg(token, room, "Thinking…") + heartbeat = max(10, THINKING_INTERVAL_SEC) + next_heartbeat = time.monotonic() + heartbeat + while not done.wait(max(0, next_heartbeat - time.monotonic())): + send_msg(token, room, state.status_line()) + next_heartbeat += heartbeat + thread.join(timeout=1) + return result["reply"] or "Model backend is busy. Try again in a moment." + def sync_loop(token: str, room_id: str): since = None try: @@ -2931,6 +3285,7 @@ def sync_loop(token: str, room_id: str): cleaned_body = _strip_bot_mention(body) lower_body = cleaned_body.lower() + mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep") # Only do live cluster introspection in DMs. allow_tools = is_dm @@ -2984,39 +3339,34 @@ def sync_loop(token: str, room_id: str): fallback = "I don't have enough data to answer that." if cluster_query: - facts_answer = cluster_answer( - cleaned_body, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history[hist_key], - ) open_ended = _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body) if open_ended: - llm_context = _append_history_context(context, history[hist_key]) - reply = ollama_reply_with_thinking( + reply = open_ended_with_thinking( token, rid, - hist_key, cleaned_body, - context=llm_context, - fallback=facts_answer or fallback, - use_history=False, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", ) else: - reply = facts_answer or fallback + reply = ( + cluster_answer( + cleaned_body, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history[hist_key], + ) + or fallback + ) else: - llm_prompt = cleaned_body - reply = ollama_reply_with_thinking( - token, - rid, - hist_key, - llm_prompt, - context=context, - fallback=fallback, - use_history=False, - ) + reply = _non_cluster_reply(cleaned_body) send_msg(token, rid, reply) + history[hist_key].append(f"Atlas: {reply}") + history[hist_key] = history[hist_key][-80:] def login_with_retry(): last_err = None From 2cab1d2f280d4306f4d34d4b04562a1af3ea5be6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:09:48 -0300 Subject: [PATCH 359/416] atlasbot: guard open-ended LLM calls --- services/comms/scripts/atlasbot/bot.py | 28 ++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index aa7e6148..47458ea3 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2530,6 +2530,26 @@ def _open_ended_system() -> str: ) +def _ollama_call_safe( + hist_key, + prompt: str, + *, + context: str, + fallback: str, + system_override: str | None = None, +) -> str: + try: + return _ollama_call( + hist_key, + prompt, + context=context, + use_history=False, + system_override=system_override, + ) + except Exception: + return fallback + + def _candidate_note(candidate: dict[str, Any]) -> str: claim = str(candidate.get("claim") or candidate.get("summary") or "") return claim[:160] + ("…" if len(claim) > 160 else "") @@ -2569,11 +2589,11 @@ def _open_ended_fast( f"Question: {prompt}" ) context = _append_history_context(fact_pack, history_lines) - reply = _ollama_call( + reply = _ollama_call_safe( ("fast", "open"), synthesis_prompt, context=context, - use_history=False, + fallback="I don't have enough data to answer that.", system_override=_open_ended_system(), ) return _ensure_scores(reply) @@ -2690,11 +2710,11 @@ def _open_ended_deep( f"Selected: {json.dumps(top, ensure_ascii=False)}" ) context = _append_history_context(fact_pack, history_lines) - reply = _ollama_call( + reply = _ollama_call_safe( ("deep", "open"), synth_prompt, context=context, - use_history=False, + fallback="I don't have enough data to answer that.", system_override=_open_ended_system(), ) state.update("done", step=6) From 9c042c78361fe9f7d13aa4ccf6da059b771c05f8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:11:58 -0300 Subject: [PATCH 360/416] atlasbot: bump rollout checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index cc628dd9..97567eb6 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-70 + checksum/atlasbot-configmap: manual-atlasbot-71 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 056f512d6769b7980aeab77fd0ddc698f33137a7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:16:47 -0300 Subject: [PATCH 361/416] atlasbot: add model fallback and rollout --- services/comms/atlasbot-deployment.yaml | 4 +++- services/comms/scripts/atlasbot/bot.py | 24 +++++++++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 97567eb6..7414f1e0 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-71 + checksum/atlasbot-configmap: manual-atlasbot-72 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -83,6 +83,8 @@ spec: value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL value: qwen2.5:14b-instruct + - name: OLLAMA_FALLBACK_MODEL + value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC value: "600" - name: ATLASBOT_THINKING_INTERVAL_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 47458ea3..2c93b759 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -17,6 +17,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") +FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") API_KEY = os.environ.get("CHAT_API_KEY", "") OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090")) @@ -3133,14 +3134,23 @@ def _ollama_call( if lock: lock.acquire() try: - with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: - data = json.loads(resp.read().decode()) - msg = data.get("message") if isinstance(data, dict) else None - if isinstance(msg, dict): - raw_reply = msg.get("content") + try: + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + except error.HTTPError as exc: + if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]: + payload["model"] = FALLBACK_MODEL + r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) else: - raw_reply = data.get("response") or data.get("reply") or data - reply = _normalize_reply(raw_reply) or "I'm here to help." + raise + msg = data.get("message") if isinstance(data, dict) else None + if isinstance(msg, dict): + raw_reply = msg.get("content") + else: + raw_reply = data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." if use_history: history[hist_key].append(f"Atlas: {reply}") return reply From 1f6bbceb2446856fc2f9ffc19ac188b1d2129900 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:27:19 -0300 Subject: [PATCH 362/416] atlasbot: improve metric parsing and cluster intent --- services/comms/scripts/atlasbot/bot.py | 48 +++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 2c93b759..b9bc0e64 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -190,6 +190,8 @@ _INSIGHT_HINT_WORDS = { "surprising", "weird", "odd", + "unusual", + "outlier", "fun", "cool", "unique", @@ -540,6 +542,13 @@ def _detect_operation(q: str) -> str | None: def _detect_metric(q: str) -> str | None: tokens = set(_tokens(q)) + expanded: set[str] = set(tokens) + for token in list(tokens): + for part in re.split(r"[-_]", token): + part = part.strip() + if len(part) >= 2: + expanded.add(part) + tokens = expanded for metric, phrases in METRIC_HINTS.items(): for phrase in phrases: if " " in phrase: @@ -1271,6 +1280,19 @@ def snapshot_metric_answer( pending = metrics.get("pods_pending") failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") + status_terms = ("running", "pending", "failed", "succeeded", "completed") + if sum(1 for term in status_terms if term in q) > 1: + parts = [] + if running is not None: + parts.append(f"running {running:.0f}") + if pending is not None: + parts.append(f"pending {pending:.0f}") + if failed is not None: + parts.append(f"failed {failed:.0f}") + if succeeded is not None: + parts.append(f"succeeded {succeeded:.0f}") + if parts: + return _format_confidence(f"Pods: {', '.join(parts)}.", "high") if "pending" in q and pending is not None: return _format_confidence(f"Pending pods: {pending:.0f}.", "high") if "failed" in q and failed is not None: @@ -1345,7 +1367,17 @@ def structured_answer( if hw_line: return _format_confidence(hw_line, "high") - if op == "top" and metric is None: + if entity == "node" and op == "status" and metric is None: + summary = _nodes_summary_line(inventory, snapshot) + if summary: + return _format_confidence(summary, "high") + + if entity == "node" and metric is None and any(word in q for word in ("hardware", "architecture", "class", "mix")): + hw_line = _hardware_mix_line(inventory) + if hw_line: + return _format_confidence(hw_line, "medium") + + if op == "top" and metric is None and not any(word in q for word in ("hardware", "architecture", "class")): metric = "cpu" # Metrics-first when a metric or top operation is requested. @@ -2807,8 +2839,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory=inventory, workloads=workloads, ) - cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) or ( - _is_subjective_query(cleaned) and history_cluster + cluster_query = ( + _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + or history_cluster + or _knowledge_intent(cleaned) + or _is_subjective_query(cleaned) ) context = "" if cluster_query: @@ -3347,8 +3382,11 @@ def sync_loop(token: str, room_id: str): inventory=inventory, workloads=workloads, ) - cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) or ( - _is_subjective_query(cleaned_body) and history_cluster + cluster_query = ( + _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + or history_cluster + or _knowledge_intent(cleaned_body) + or _is_subjective_query(cleaned_body) ) context = "" if cluster_query: From fdf4896f7c10037cfc90d565bb8d107ca04794b1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:27:52 -0300 Subject: [PATCH 363/416] atlasbot: roll pod after metric parsing update --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7414f1e0..4e27b5a7 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-72 + checksum/atlasbot-configmap: manual-atlasbot-73 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From b4f5fbeb2b3d3b6c50c7c5150637600efe7e4325 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:43:37 -0300 Subject: [PATCH 364/416] monitoring: unify gpu namespace usage --- scripts/dashboards_render_atlas.py | 47 ++++++++++++++----- services/monitoring/dashboards/atlas-gpu.json | 6 +-- .../monitoring/dashboards/atlas-overview.json | 2 +- .../monitoring/grafana-dashboard-gpu.yaml | 6 +-- .../grafana-dashboard-overview.yaml | 2 +- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 445de94b..2e5c73b6 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -208,32 +208,53 @@ def namespace_ram_raw(scope_var): def namespace_gpu_usage_instant(scope_var): - dcgm = f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" - jetson = jetson_gpu_usage_by_namespace(scope_var) - merged = ( - f'label_replace({dcgm}, "source", "dcgm", "", "") ' - f'or label_replace({jetson}, "source", "jetson", "", "")' - ) - return f"sum by (namespace) ({merged})" + return gpu_usage_by_namespace(scope_var) def jetson_gpu_util_by_node(): return 'max by (node) (jetson_gr3d_freq_percent{node!=""})' -def jetson_gpu_util_by_hostname(): +def dcgm_gpu_util_by_node(): + dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")' + dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")' return ( - 'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), ' - '"Hostname", "$1", "node", "(.*)")' + "avg by (node) (" + f"{dcgm_ns} * on(namespace,pod) group_left(node) " + 'kube_pod_info{namespace="monitoring"}' + ")" ) -def jetson_gpu_requests(scope_var): +def gpu_util_by_node(): + return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}" + + +def gpu_util_by_hostname(): + return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")' + + +def gpu_node_labels(): + return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}' + + +def gpu_requests_by_namespace_node(scope_var): return ( "sum by (namespace,node) (" f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' "* on(namespace,pod) group_left(node) kube_pod_info " - '* on(node) group_left(label_jetson) kube_node_labels{label_jetson="true"}' + f"* on(node) group_left() {gpu_node_labels()}" + ")" + ) + + +def gpu_usage_by_namespace(scope_var): + requests_by_ns = gpu_requests_by_namespace_node(scope_var) + total_by_node = f"sum by (node) ({requests_by_ns})" + return ( + "sum by (namespace) (" + f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " + f"* on(node) group_left() {gpu_util_by_node()}" ")" ) @@ -2695,7 +2716,7 @@ def build_gpu_dashboard(): timeseries_panel( 3, "GPU Util by Node", - f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})', + gpu_util_by_hostname(), {"h": 8, "w": 12, "x": 0, "y": 8}, unit="percent", legend="{{Hostname}}", diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 132f2766..8542c5e7 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -126,7 +126,7 @@ }, "targets": [ { - "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", + "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index b212c8cd..31b78674 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 55f63e84..8d3a3dd5 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -135,7 +135,7 @@ data: }, "targets": [ { - "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))", + "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index a8990024..2a7cc2b7 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } From a255c60aed6d900500a397a03834cf0fef7a1fa3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:46:58 -0300 Subject: [PATCH 365/416] monitoring: fix gpu idle label --- scripts/dashboards_render_atlas.py | 4 ++-- services/monitoring/dashboards/atlas-gpu.json | 4 ++-- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/grafana-dashboard-gpu.yaml | 4 ++-- services/monitoring/grafana-dashboard-overview.yaml | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 2e5c73b6..5db798d7 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -243,7 +243,7 @@ def gpu_requests_by_namespace_node(scope_var): "sum by (namespace,node) (" f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' "* on(namespace,pod) group_left(node) kube_pod_info " - f"* on(node) group_left() {gpu_node_labels()}" + f"* on(node) group_left() ({gpu_node_labels()})" ")" ) @@ -254,7 +254,7 @@ def gpu_usage_by_namespace(scope_var): return ( "sum by (namespace) (" f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " - f"* on(node) group_left() {gpu_util_by_node()}" + f"* on(node) group_left() ({gpu_util_by_node()})" ")" ) diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 8542c5e7..6f993d9b 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 31b78674..1f8635bc 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1901,7 +1901,7 @@ }, "targets": [ { - "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 8d3a3dd5..34079636 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 2a7cc2b7..fdfe1a70 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1910,7 +1910,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } From ee13e8da30edb98cd85f36f0e01c8a98070d0c0d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 21:52:07 -0300 Subject: [PATCH 366/416] atlasbot: refine open-ended reasoning --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 410 ++++++++++++++++++++++-- 2 files changed, 378 insertions(+), 34 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4e27b5a7..5e5bc05d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-73 + checksum/atlasbot-configmap: manual-atlasbot-74 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index b9bc0e64..01762934 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -138,6 +138,7 @@ CLUSTER_HINT_WORDS = { "cluster", "k8s", "kubernetes", + "health", "node", "nodes", "hardware", @@ -211,6 +212,7 @@ _OVERVIEW_HINT_WORDS = { "explain", "tell me about", "what do you know", + "health", } _OLLAMA_LOCK = threading.Lock() @@ -1220,6 +1222,8 @@ def snapshot_metric_answer( q = normalize_query(prompt) metric = _detect_metric(q) op = _detect_operation(q) + if op == "list" and metric in {"cpu", "ram", "net", "io"}: + op = "top" include_hw, exclude_hw = _detect_hardware_filters(q) nodes_in_query = _extract_titan_nodes(q) only_workers = "worker" in q or "workers" in q @@ -1340,6 +1344,8 @@ def structured_answer( tokens = _tokens(q) op = _detect_operation(q) metric = _detect_metric(q) + if op == "list" and metric in {"cpu", "ram", "net", "io"}: + op = "top" entity = _detect_entity(q) include_hw, exclude_hw = _detect_hardware_filters(q) nodes_in_query = _extract_titan_nodes(q) @@ -1646,6 +1652,37 @@ def _is_insight_query(query: str) -> bool: return False +_FOLLOWUP_HINTS = ( + "what about", + "how about", + "and what", + "and how", + "tell me more", + "anything else", + "something else", + "that one", + "those", + "them", + "it", + "this", + "that", + "else", + "another", + "again", +) + + +def _is_followup_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + if any(hint in q for hint in _FOLLOWUP_HINTS): + return True + if len(q.split()) <= 3 and not any(word in q for word in _INSIGHT_HINT_WORDS): + return True + return False + + def _is_subjective_query(query: str) -> bool: q = normalize_query(query) if not q: @@ -2541,6 +2578,12 @@ def _fact_pack_lines( if not trimmed or trimmed.lower().startswith("facts"): continue lines.append(trimmed) + if _knowledge_intent(prompt) or _doc_intent(prompt) or _is_overview_query(prompt): + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + for kb_line in kb_titles.splitlines(): + if kb_line.strip(): + lines.append(kb_line.strip()) return lines @@ -2549,12 +2592,194 @@ def _fact_pack_text(lines: list[str]) -> str: return "Fact pack:\n" + "\n".join(labeled) +_ALLOWED_INSIGHT_TAGS = { + "availability", + "architecture", + "database", + "hardware", + "inventory", + "node_detail", + "os", + "pods", + "utilization", + "workloads", + "workers", +} + +_DYNAMIC_TAGS = {"availability", "database", "pods", "utilization", "workloads"} +_INVENTORY_TAGS = {"hardware", "architecture", "inventory", "workers", "node_detail", "os"} + + +def _fact_line_tags(line: str) -> set[str]: + text = (line or "").lower() + tags: set[str] = set() + if any(key in text for key in ("nodes_total", "ready", "not_ready", "workers_ready", "workers_not_ready")): + tags.add("availability") + if "nodes_by_arch" in text or "arch " in text or "architecture" in text: + tags.add("architecture") + if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")): + tags.update({"hardware", "inventory"}) + if "control_plane_nodes" in text or "worker_nodes" in text: + tags.add("inventory") + if any(key in text for key in ("hottest_", "node_usage", "cpu=", "ram=", "net=", "io=")): + tags.add("utilization") + if "postgres_" in text or "postgres connections" in text: + tags.add("database") + if "pods_" in text or "pod phases" in text: + tags.add("pods") + if "workloads" in text or "primary_node" in text: + tags.add("workloads") + if "node_details" in text: + tags.add("node_detail") + if "os mix" in text or "os" in text: + tags.add("os") + return tags & _ALLOWED_INSIGHT_TAGS + + +def _fact_pack_meta(lines: list[str]) -> dict[str, dict[str, Any]]: + meta: dict[str, dict[str, Any]] = {} + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = sorted(_fact_line_tags(line)) + meta[fid] = {"tags": tags} + return meta + + +def _history_tags(history_lines: list[str]) -> set[str]: + tags: set[str] = set() + for line in history_lines[-6:]: + tags.update(_fact_line_tags(line)) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _seed_insights( + lines: list[str], + fact_meta: dict[str, dict[str, Any]], + *, + limit: int = 6, +) -> list[dict[str, Any]]: + priority = [ + "utilization", + "database", + "pods", + "workloads", + "availability", + "hardware", + "architecture", + "inventory", + ] + seeds: list[dict[str, Any]] = [] + used_tags: set[str] = set() + for tag in priority: + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = set(fact_meta.get(fid, {}).get("tags") or []) + if tag not in tags or fid in {s["fact_ids"][0] for s in seeds}: + continue + summary = line.lstrip("- ").strip() + seeds.append( + { + "summary": summary, + "fact_ids": [fid], + "relevance": 0.5, + "novelty": 0.5, + "rationale": "seeded from fact pack", + "tags": sorted(tags), + } + ) + used_tags.update(tags) + if len(seeds) >= limit: + return seeds + return seeds + + +def _insight_tags(insight: dict[str, Any], fact_meta: dict[str, dict[str, Any]]) -> set[str]: + tags: set[str] = set() + for fid in insight.get("fact_ids") if isinstance(insight.get("fact_ids"), list) else []: + tags.update(fact_meta.get(fid, {}).get("tags") or []) + raw_tags = insight.get("tags") if isinstance(insight.get("tags"), list) else [] + tags.update(t for t in raw_tags if isinstance(t, str)) + summary = insight.get("summary") or insight.get("claim") or "" + if isinstance(summary, str): + tags.update(_fact_line_tags(summary)) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _insight_score( + insight: dict[str, Any], + *, + preference: str, + prefer_tags: set[str], + avoid_tags: set[str], + history_tags: set[str], + fact_meta: dict[str, dict[str, Any]], +) -> float: + base = _score_insight(insight, preference) + tags = _insight_tags(insight, fact_meta) + if prefer_tags and tags: + base += 0.15 * len(tags & prefer_tags) + if avoid_tags and tags: + base -= 0.12 * len(tags & avoid_tags) + if history_tags and tags: + base -= 0.08 * len(tags & history_tags) + if preference == "novelty": + if tags & _DYNAMIC_TAGS: + base += 0.12 + if tags & _INVENTORY_TAGS: + base -= 0.08 + return base + + +def _select_diverse_insights( + candidates: list[dict[str, Any]], + *, + preference: str, + prefer_tags: set[str], + avoid_tags: set[str], + history_tags: set[str], + fact_meta: dict[str, dict[str, Any]], + count: int = 2, +) -> list[dict[str, Any]]: + scored: list[tuple[float, dict[str, Any]]] = [] + for item in candidates: + tags = _insight_tags(item, fact_meta) + item["tags"] = sorted(tags) + score = _insight_score( + item, + preference=preference, + prefer_tags=prefer_tags, + avoid_tags=avoid_tags, + history_tags=history_tags, + fact_meta=fact_meta, + ) + scored.append((score, item)) + scored.sort(key=lambda pair: pair[0], reverse=True) + picked: list[dict[str, Any]] = [] + used_tags: set[str] = set() + for _, item in scored: + tags = set(item.get("tags") or []) + if used_tags and tags and tags <= used_tags and len(picked) < count: + continue + picked.append(item) + used_tags.update(tags) + if len(picked) >= count: + break + if len(picked) < count: + for _, item in scored: + if item in picked: + continue + picked.append(item) + if len(picked) >= count: + break + return picked + + def _open_ended_system() -> str: return ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "Use ONLY the provided fact pack and recent chat as your evidence. " "You may draw light inferences if you label them as such. " - "Write concise, human sentences, not a list. " + "Write concise, human sentences with a helpful, calm tone (not a list). " "If the question is subjective, share a light opinion grounded in facts. " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible. " @@ -2608,18 +2833,52 @@ def _open_ended_fast( *, fact_pack: str, history_lines: list[str], + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + tags_available: set[str], + history_tags: set[str], state: ThoughtState | None = None, ) -> str: if state: - state.update("synthesizing", step=2) + state.update("planning", step=1) + analysis = _interpret_open_question( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + tags_available=tags_available, + avoid_tags=history_tags, + state=state, + ) + candidates = _select_insights( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + state=state or ThoughtState(), + analysis=analysis, + fact_lines=fact_lines, + fact_meta=fact_meta, + avoid_tags=history_tags, + ) + prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)} + selected = _select_diverse_insights( + candidates, + preference=analysis.get("preference", "balanced"), + prefer_tags=prefer_tags, + avoid_tags=history_tags, + history_tags=history_tags, + fact_meta=fact_meta, + count=2, + ) + if state: + state.update("synthesizing", step=3) synthesis_prompt = ( - "You are given a question and a fact pack. " - "Answer in 2-4 sentences, using only facts from the pack. " - "Pick one or two facts that best fit the question and explain why they matter. " - "If the question is subjective, add a light opinion grounded in those facts. " - "Do not list raw facts; speak naturally. " + "Use the question, fact pack, and selected insights to answer in 2-4 sentences. " + "Speak naturally, not as a list. " + "If the question is subjective, add a light opinion grounded in facts. " + "Avoid repeating the exact same observation as the most recent response if possible. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" - f"Question: {prompt}" + f"Question: {prompt}\n" + f"Selected: {json.dumps(selected, ensure_ascii=False)}" ) context = _append_history_context(fact_pack, history_lines) reply = _ollama_call_safe( @@ -2637,23 +2896,36 @@ def _interpret_open_question( *, fact_pack: str, history_lines: list[str], + tags_available: set[str], + avoid_tags: set[str], + state: ThoughtState | None = None, ) -> dict[str, Any]: + tags_list = ", ".join(sorted(tags_available)) if tags_available else "none" + avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" prompt_text = ( "Analyze the question against the fact pack. " "Return JSON: {\"focus\":\"...\",\"preference\":\"balanced|novelty|utilization|stability|risk\"," - "\"notes\":\"...\"}. " + "\"tags\":[\"...\"] ,\"notes\":\"...\"}. " + "If the question implies interesting/unique/unconventional/cool, set preference to novelty " + "and prefer dynamic tags (utilization/pods/database/availability) when possible. " + f"Use only these tags if relevant: {tags_list}. Avoid tags: {avoid_list}. " "Use only the fact pack." ) context = _append_history_context(fact_pack, history_lines) analysis = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) if not isinstance(analysis, dict): - return {"focus": "cluster snapshot", "preference": "balanced", "notes": ""} + analysis = {"focus": "cluster snapshot", "preference": "balanced", "notes": "", "tags": []} preference = analysis.get("preference") or "balanced" if preference not in ("balanced", "novelty", "utilization", "stability", "risk"): preference = "balanced" analysis["preference"] = preference analysis.setdefault("focus", "cluster snapshot") analysis.setdefault("notes", "") + tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else [] + clean_tags = {t for t in tags if isinstance(t, str)} + analysis["tags"] = sorted(clean_tags & tags_available) + if state: + state.update("planning", step=1, note=str(analysis.get("focus") or "")) return analysis @@ -2663,27 +2935,41 @@ def _select_insights( fact_pack: str, history_lines: list[str], state: ThoughtState, + analysis: dict[str, Any], + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + avoid_tags: set[str], ) -> list[dict[str, Any]]: + preferred_tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else [] + prefer_list = ", ".join(sorted({t for t in preferred_tags if isinstance(t, str)})) + avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" + available_list = ", ".join(sorted({t for t in _ALLOWED_INSIGHT_TAGS})) insight_prompt = ( "From the fact pack, select 3-5 candidate insights that could answer the question. " "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," - "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\"}]}. " - "Use only the fact pack." + "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\",\"tags\":[\"...\"]}]}. " + f"Available tags: {available_list}. Prefer tags: {prefer_list or 'none'}. Avoid tags: {avoid_list}. " + "Use only the fact pack and provided tags." ) state.update("drafting candidates", step=2) context = _append_history_context(fact_pack, history_lines) result = _ollama_json_call(insight_prompt + f" Question: {prompt}", context=context) insights = result.get("insights") if isinstance(result, dict) else None if not isinstance(insights, list): - return [] + insights = [] cleaned: list[dict[str, Any]] = [] for item in insights: if not isinstance(item, dict): continue if not item.get("summary") or not item.get("fact_ids"): continue + tags = _insight_tags(item, fact_meta) + item["tags"] = sorted(tags) cleaned.append(item) state.update("drafting candidates", step=2, note=_candidate_note(item)) + seeds = _seed_insights(fact_lines, fact_meta) + for seed in seeds: + cleaned.append(seed) return cleaned @@ -2707,18 +2993,36 @@ def _open_ended_deep( fact_pack: str, fact_ids: set[str], history_lines: list[str], + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + tags_available: set[str], + history_tags: set[str], state: ThoughtState | None = None, ) -> str: state = state or ThoughtState() if not fact_ids: return _ensure_scores("I don't have enough data to answer that.") - state.total_steps = 6 - state.update("planning", step=1) - analysis = _interpret_open_question(prompt, fact_pack=fact_pack, history_lines=history_lines) - state.update("planning", step=1, note=str(analysis.get("focus") or "")) + state.total_steps = 7 + analysis = _interpret_open_question( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + tags_available=tags_available, + avoid_tags=history_tags, + state=state, + ) - candidates = _select_insights(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state) - state.update("verifying", step=3) + candidates = _select_insights( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + state=state, + analysis=analysis, + fact_lines=fact_lines, + fact_meta=fact_meta, + avoid_tags=history_tags, + ) + state.update("verifying", step=3, note="scoring insights") filtered: list[dict[str, Any]] = [] for cand in candidates: cites = cand.get("fact_ids") if isinstance(cand.get("fact_ids"), list) else [] @@ -2729,9 +3033,17 @@ def _open_ended_deep( filtered = candidates preference = analysis.get("preference", "balanced") - ranked = sorted(filtered, key=lambda item: _score_insight(item, preference), reverse=True) - top = ranked[:2] - state.update("synthesizing", step=4) + prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)} + top = _select_diverse_insights( + filtered, + preference=preference, + prefer_tags=prefer_tags, + avoid_tags=history_tags, + history_tags=history_tags, + fact_meta=fact_meta, + count=2, + ) + state.update("synthesizing", step=4, note="composing response") synth_prompt = ( "Use the question, fact pack, and selected insights to craft a concise answer. " "Write 2-4 sentences. Explain why the selected insights stand out. " @@ -2740,6 +3052,7 @@ def _open_ended_deep( "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" f"Question: {prompt}\n" f"Interpretation: {json.dumps(analysis, ensure_ascii=False)}\n" + f"Recent tags: {', '.join(sorted(history_tags)) if history_tags else 'none'}\n" f"Selected: {json.dumps(top, ensure_ascii=False)}" ) context = _append_history_context(fact_pack, history_lines) @@ -2750,7 +3063,7 @@ def _open_ended_deep( fallback="I don't have enough data to answer that.", system_override=_open_ended_system(), ) - state.update("done", step=6) + state.update("done", step=7) return _ensure_scores(reply) @@ -2769,9 +3082,31 @@ def open_ended_answer( return _ensure_scores("I don't have enough data to answer that.") fact_pack = _fact_pack_text(lines) fact_ids = {f"F{i+1}" for i in range(len(lines))} + fact_meta = _fact_pack_meta(lines) + tags_available = {tag for entry in fact_meta.values() for tag in entry.get("tags", [])} + history_tags = _history_tags(history_lines) if mode == "fast": - return _open_ended_fast(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state) - return _open_ended_deep(prompt, fact_pack=fact_pack, fact_ids=fact_ids, history_lines=history_lines, state=state) + return _open_ended_fast( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + fact_lines=lines, + fact_meta=fact_meta, + tags_available=tags_available, + history_tags=history_tags, + state=state, + ) + return _open_ended_deep( + prompt, + fact_pack=fact_pack, + fact_ids=fact_ids, + history_lines=history_lines, + fact_lines=lines, + fact_meta=fact_meta, + tags_available=tags_available, + history_tags=history_tags, + state=state, + ) def _non_cluster_reply(prompt: str) -> str: @@ -2826,9 +3161,9 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): self._write_json(400, {"error": "missing_prompt"}) return cleaned = _strip_bot_mention(prompt) - mode = str(payload.get("mode") or "fast").lower() + mode = str(payload.get("mode") or "deep").lower() if mode not in ("fast", "deep"): - mode = "fast" + mode = "deep" snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() workloads = _snapshot_workloads(snapshot) @@ -2839,11 +3174,12 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): inventory=inventory, workloads=workloads, ) + followup = _is_followup_query(cleaned) cluster_query = ( _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) - or history_cluster or _knowledge_intent(cleaned) or _is_subjective_query(cleaned) + or (history_cluster and followup) ) context = "" if cluster_query: @@ -2857,7 +3193,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): ) fallback = "I don't have enough data to answer that." if cluster_query: - open_ended = _is_subjective_query(cleaned) or _knowledge_intent(cleaned) + open_ended = ( + _is_subjective_query(cleaned) + or _knowledge_intent(cleaned) + or _is_overview_query(cleaned) + ) if open_ended: answer = open_ended_answer( cleaned, @@ -3068,7 +3408,6 @@ def _knowledge_intent(prompt: str) -> bool: "summary", "describe", "explain", - "what is", ) ) @@ -3269,7 +3608,7 @@ def open_ended_with_thinking( ) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() - total_steps = 2 if mode == "fast" else 6 + total_steps = 4 if mode == "fast" else 7 state = ThoughtState(total_steps=total_steps) def worker(): @@ -3382,11 +3721,12 @@ def sync_loop(token: str, room_id: str): inventory=inventory, workloads=workloads, ) + followup = _is_followup_query(cleaned_body) cluster_query = ( _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) - or history_cluster or _knowledge_intent(cleaned_body) or _is_subjective_query(cleaned_body) + or (history_cluster and followup) ) context = "" if cluster_query: @@ -3407,7 +3747,11 @@ def sync_loop(token: str, room_id: str): fallback = "I don't have enough data to answer that." if cluster_query: - open_ended = _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body) + open_ended = ( + _is_subjective_query(cleaned_body) + or _knowledge_intent(cleaned_body) + or _is_overview_query(cleaned_body) + ) if open_ended: reply = open_ended_with_thinking( token, From 7b43e8654f5a820009e1a1723922fe2f159e9467 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:00:19 -0300 Subject: [PATCH 367/416] monitoring: send grafana alerts via postmark --- services/monitoring/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 8e225d49..6185e595 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -339,7 +339,7 @@ spec: GF_AUTH_ANONYMOUS_ORG_NAME: "Overview" GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" GF_SMTP_ENABLED: "true" - GF_SMTP_HOST: "mail.bstein.dev:587" + GF_SMTP_HOST: "smtp.postmarkapp.com:587" GF_SMTP_FROM: "no-reply-grafana@bstein.dev" GF_SMTP_FROM_NAME: "Atlas Grafana" GRAFANA_ALERT_EMAILS: "brad@bstein.dev" From 1113b1625e6ecfd8dbabcfca70844de3710a390f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:22:50 -0300 Subject: [PATCH 368/416] atlasbot: overhaul open-ended reasoning --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 697 +++++++++--------------- 2 files changed, 253 insertions(+), 446 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 5e5bc05d..17e2cb2f 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-74 + checksum/atlasbot-configmap: manual-atlasbot-75 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 01762934..06685217 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -198,6 +198,8 @@ _INSIGHT_HINT_WORDS = { "unique", "notable", "coolest", + "risk", + "risky", "favorite", "favourite", "trivia", @@ -1641,17 +1643,6 @@ def _hottest_summary_line(metrics: dict[str, Any]) -> str: return "Hot spots: " + "; ".join(parts) + "." -def _is_insight_query(query: str) -> bool: - q = normalize_query(query) - if not q: - return False - if any(word in q for word in _INSIGHT_HINT_WORDS): - return True - if "most" in q and any(word in q for word in ("unusual", "odd", "weird", "unconventional")): - return True - return False - - _FOLLOWUP_HINTS = ( "what about", "how about", @@ -1724,198 +1715,6 @@ def _doc_intent(query: str) -> bool: ) -def _insight_candidates( - inventory: list[dict[str, Any]], - snapshot: dict[str, Any] | None, -) -> list[tuple[str, str, str]]: - metrics = _snapshot_metrics(snapshot) - candidates: list[tuple[str, str, str]] = [] - - nodes_line = _nodes_summary_line(inventory, snapshot) - if nodes_line and "not ready" in nodes_line.lower(): - candidates.append(("availability", nodes_line, "high")) - - hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} - if hottest: - def _hot_node(entry: dict[str, Any]) -> str: - if not isinstance(entry, dict): - return "" - return ( - entry.get("node") - or entry.get("label") - or (entry.get("metric") or {}).get("node") - or "" - ) - - cpu = hottest.get("cpu") if isinstance(hottest.get("cpu"), dict) else {} - cpu_node = _hot_node(cpu) - if cpu_node and cpu.get("value") is not None: - value_fmt = _format_metric_value(str(cpu.get("value")), percent=True) - candidates.append(("cpu", f"The busiest CPU right now is {cpu_node} at about {value_fmt}.", "high")) - ram = hottest.get("ram") if isinstance(hottest.get("ram"), dict) else {} - ram_node = _hot_node(ram) - if ram_node and ram.get("value") is not None: - value_fmt = _format_metric_value(str(ram.get("value")), percent=True) - candidates.append(("ram", f"RAM usage peaks on {ram_node} at about {value_fmt}.", "high")) - - postgres_line = _postgres_summary_line(metrics) - if postgres_line: - candidates.append(("postgres", postgres_line, "high")) - - hardware_insight = _hardware_insight(inventory) - if hardware_insight: - candidates.append(("hardware", hardware_insight, "medium")) - - pods_line = _pods_summary_line(metrics) - if pods_line: - candidates.append(("pods", pods_line, "high")) - - return candidates - - -def _hardware_insight(inventory: list[dict[str, Any]]) -> str: - if not inventory: - return "" - groups = _group_nodes(inventory) - jetsons = groups.get("jetson") or [] - rpi5 = groups.get("rpi5") or [] - rpi4 = groups.get("rpi4") or [] - amd64 = groups.get("amd64") or [] - parts: list[str] = [] - if rpi5: - parts.append(f"rpi5={len(rpi5)}") - if rpi4: - parts.append(f"rpi4={len(rpi4)}") - if jetsons: - jetson_names = ", ".join(jetsons[:2]) - parts.append(f"jetson={len(jetsons)} ({jetson_names})") - if amd64: - parts.append(f"amd64={len(amd64)}") - return ", ".join(parts) - - -def _recent_insight_keys(history_lines: list[str]) -> set[str]: - used: set[str] = set() - for line in history_lines[-10:]: - lower = normalize_query(line) - if not lower: - continue - if "postgres" in lower or "connections" in lower: - used.add("postgres") - if "atlas mixes" in lower or "hardware" in lower or "rpi" in lower or "jetson" in lower: - used.add("hardware") - if "busiest cpu" in lower or "cpu right now" in lower or "cpu " in lower: - used.add("cpu") - if "ram usage" in lower or "memory" in lower: - used.add("ram") - if "pods" in lower: - used.add("pods") - if "not ready" in lower: - used.add("availability") - return used - - -def _select_insight( - prompt: str, - candidates: list[tuple[str, str, str]], - *, - used_keys: set[str] | None = None, -) -> tuple[str, str, str] | None: - if not candidates: - return None - used = used_keys or set() - q = normalize_query(prompt) - prefer_keys: list[str] = [] - if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): - prefer_keys.extend(["hardware", "availability"]) - if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")): - prefer_keys.extend(["hardware", "cpu", "ram"]) - if "interesting" in q and "most interesting" not in q: - prefer_keys.extend(["hardware", "postgres", "cpu", "ram"]) - avoid_used = any(word in q for word in ("another", "else", "different", "other")) or "most interesting" in q - if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1: - for candidate in candidates: - if candidate[0] not in used: - return candidate - return candidates[1] - if prefer_keys: - for prefer in prefer_keys: - for key, text, conf in candidates: - if key == prefer and (not avoid_used or key not in used): - return key, text, conf - for prefer in prefer_keys: - for key, text, conf in candidates: - if key == prefer: - return key, text, conf - if used and avoid_used: - for candidate in candidates: - if candidate[0] not in used: - return candidate - return candidates[0] - - -def _format_insight_text(key: str, text: str) -> str: - cleaned = text.strip().rstrip(".") - if not cleaned: - return "" - if key == "hardware": - counts = ( - cleaned.replace("Hardware mix includes ", "") - .replace("Atlas mixes tiny ", "") - .replace("Atlas mixes ", "") - .replace("which is unusual for a homelab cluster", "") - .strip() - .strip(".") - ) - has_jetson = "jetson=" in counts - has_amd64 = "amd64=" in counts - detail = f"mixed hardware stack ({counts})" - if has_jetson and has_amd64: - flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 boxes." - elif has_jetson: - flavor = "It pairs low-power Pis with Jetson accelerators for edge and AI workloads." - elif has_amd64: - flavor = "It mixes low-power Pis with a couple of heavier AMD64 nodes." - else: - flavor = "It is a pretty uniform hardware stack, which is rare for a homelab." - return f"{detail}. {flavor}" - if key == "postgres": - detail = cleaned.replace("Postgres is at ", "") - return f"Postgres is at {detail}; that feels like healthy, steady load rather than strain." - if key == "pods": - detail = cleaned.replace("There are ", "") - return f"Pods look steady ({detail}); nothing looks stuck or unhealthy." - if key == "availability": - return cleaned + " That is the kind of stability I like to see." - if key in ("cpu", "ram"): - suffix = ( - " If you're chasing hotspots, that's the node I'd watch first." - if key == "cpu" - else " That box is carrying the heaviest memory load right now." - ) - return cleaned + "." + suffix - return cleaned + "." - - -def _insight_prefix(prompt: str) -> str: - q = normalize_query(prompt) - if "coolest" in q: - return "If I had to pick the coolest detail, I'd say " - if "favorite" in q or "favourite" in q: - return "My favorite detail is " - if "trivia" in q: - return "A bit of trivia I like: " - if "most interesting" in q: - return "The most interesting detail to me is " - if any(word in q for word in ("another", "else", "different", "other")): - return "Another interesting detail: " - if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")): - return "What stands out to me is that " - if any(word in q for word in ("interesting", "notable", "fun", "cool")): - return "One thing I'd call out is " - return "" - - def cluster_overview_answer( prompt: str, *, @@ -2784,7 +2583,7 @@ def _open_ended_system() -> str: "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible. " "Do not invent numbers or facts. " - "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100)." + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." ) @@ -2809,263 +2608,284 @@ def _ollama_call_safe( def _candidate_note(candidate: dict[str, Any]) -> str: - claim = str(candidate.get("claim") or candidate.get("summary") or "") + claim = str(candidate.get("focus") or candidate.get("answer") or "") return claim[:160] + ("…" if len(claim) > 160 else "") def _ensure_scores(answer: str) -> str: text = answer.strip() lines = [line for line in text.splitlines() if line.strip()] - has_relevance = any(line.lower().startswith("relevance:") for line in lines) - has_satisfaction = any(line.lower().startswith("satisfaction:") for line in lines) - has_confidence = any("confidence:" in line.lower() for line in lines) + has_relevance = any(line.lower().startswith("relevance") for line in lines) + has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines) + has_confidence = any(line.lower().startswith("confidence") for line in lines) + has_risk = any(line.lower().startswith("hallucinationrisk") for line in lines) if not has_confidence: lines.append("Confidence: medium") if not has_relevance: lines.append("Relevance: 70") if not has_satisfaction: lines.append("Satisfaction: 70") + if not has_risk: + lines.append("HallucinationRisk: low") return "\n".join(lines) +def _open_ended_plan( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + count: int, + state: ThoughtState | None, +) -> list[dict[str, Any]]: + if state: + state.update("planning", step=1, note="mapping angles") + count = max(1, count) + prompt_text = ( + "Analyze the question and propose up to " + f"{count} distinct answer angles that can be supported by the fact pack. " + "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). " + "If the question is subjective, propose at least one angle that surfaces a standout detail. " + "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"priority\":1-5}]}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + angles = result.get("angles") if isinstance(result, dict) else None + cleaned: list[dict[str, Any]] = [] + seen: set[str] = set() + if isinstance(angles, list): + for item in angles: + if not isinstance(item, dict): + continue + focus = str(item.get("focus") or "").strip() + if not focus or focus.lower() in seen: + continue + seen.add(focus.lower()) + priority = item.get("priority") + if not isinstance(priority, (int, float)): + priority = 3 + cleaned.append( + { + "focus": focus, + "reason": str(item.get("reason") or ""), + "priority": int(max(1, min(5, priority))), + } + ) + if not cleaned: + cleaned = [{"focus": "Direct answer", "reason": "Default fallback", "priority": 3}] + cleaned.sort(key=lambda item: item.get("priority", 3), reverse=True) + if state: + state.update("planning", step=1, note=_candidate_note(cleaned[0])) + return cleaned + + +def _normalize_score(value: Any, *, default: int = 60) -> int: + if isinstance(value, (int, float)): + return int(max(0, min(100, value))) + return default + + +def _confidence_score(value: Any) -> int: + text = str(value or "").strip().lower() + if text.startswith("high"): + return 85 + if text.startswith("low"): + return 35 + return 60 + + +def _risk_penalty(value: Any) -> int: + text = str(value or "").strip().lower() + if text.startswith("high"): + return 20 + if text.startswith("medium"): + return 10 + return 0 + + +def _open_ended_candidate( + prompt: str, + *, + focus: str, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None, + step: int, +) -> dict[str, Any]: + if state: + state.update("drafting", step=step, note=focus) + prompt_text = ( + "Using ONLY the fact pack, answer the question focusing on this angle: " + f"{focus}. " + "Write 2-4 sentences in plain prose (not a list). " + "If you infer, label it as inference. " + "Return JSON: {\"answer\":\"...\",\"confidence\":\"high|medium|low\"," + "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + if not isinstance(result, dict): + result = {} + answer = str(result.get("answer") or "").strip() + if not answer: + answer = "I don't have enough data to answer that from the current snapshot." + candidate = { + "focus": focus, + "answer": answer, + "confidence": result.get("confidence", "medium"), + "relevance": _normalize_score(result.get("relevance"), default=60), + "satisfaction": _normalize_score(result.get("satisfaction"), default=60), + "risk": result.get("risk", "medium"), + } + candidate["score"] = _candidate_score(candidate) + return candidate + + +def _candidate_score(candidate: dict[str, Any]) -> float: + relevance = _normalize_score(candidate.get("relevance"), default=60) + satisfaction = _normalize_score(candidate.get("satisfaction"), default=60) + confidence = _confidence_score(candidate.get("confidence")) + score = relevance * 0.45 + satisfaction * 0.35 + confidence * 0.2 + return score - _risk_penalty(candidate.get("risk")) + + +def _select_candidates(candidates: list[dict[str, Any]], *, count: int) -> list[dict[str, Any]]: + if not candidates: + return [] + ranked = sorted(candidates, key=lambda item: item.get("score", 0), reverse=True) + picked: list[dict[str, Any]] = [] + seen_focus: set[str] = set() + for item in ranked: + focus = str(item.get("focus") or "").strip().lower() + if focus and focus in seen_focus: + continue + picked.append(item) + if focus: + seen_focus.add(focus) + if len(picked) >= count: + break + return picked or ranked[:count] + + +def _open_ended_synthesize( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + candidates: list[dict[str, Any]], + state: ThoughtState | None, + step: int, +) -> str: + if state: + state.update("synthesizing", step=step, note="composing answer") + synth_prompt = ( + "Compose the final answer to the question using the candidate answers below. " + "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. " + "Use only the fact pack as evidence. " + "If you infer, label it as inference. " + "Avoid repeating the last response if possible. " + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " + "HallucinationRisk (low|medium|high).\n" + f"Question: {prompt}\n" + f"Candidates: {json.dumps(candidates, ensure_ascii=False)}" + ) + context = _append_history_context(fact_pack, history_lines) + reply = _ollama_call_safe( + ("open", "synth"), + synth_prompt, + context=context, + fallback="I don't have enough data to answer that.", + system_override=_open_ended_system(), + ) + return _ensure_scores(reply) + + +def _open_ended_multi( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + mode: str, + state: ThoughtState | None = None, +) -> str: + angle_count = 2 if mode == "fast" else 4 + total_steps = 1 + angle_count + 2 + if state: + state.total_steps = total_steps + angles = _open_ended_plan( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + count=angle_count, + state=state, + ) + candidates: list[dict[str, Any]] = [] + step = 2 + for angle in angles[:angle_count]: + candidates.append( + _open_ended_candidate( + prompt, + focus=str(angle.get("focus") or "Direct answer"), + fact_pack=fact_pack, + history_lines=history_lines, + state=state, + step=step, + ) + ) + step += 1 + if state: + state.update("evaluating", step=step, note="ranking candidates") + selected = _select_candidates(candidates, count=1 if mode == "fast" else 2) + step += 1 + reply = _open_ended_synthesize( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + state=state, + step=step, + ) + if state: + state.update("done", step=total_steps) + return reply + + +def _open_ended_total_steps(mode: str) -> int: + angle_count = 2 if mode == "fast" else 4 + return 1 + angle_count + 2 + + def _open_ended_fast( prompt: str, *, fact_pack: str, history_lines: list[str], - fact_lines: list[str], - fact_meta: dict[str, dict[str, Any]], - tags_available: set[str], - history_tags: set[str], state: ThoughtState | None = None, ) -> str: - if state: - state.update("planning", step=1) - analysis = _interpret_open_question( + return _open_ended_multi( prompt, fact_pack=fact_pack, history_lines=history_lines, - tags_available=tags_available, - avoid_tags=history_tags, + mode="fast", state=state, ) - candidates = _select_insights( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - state=state or ThoughtState(), - analysis=analysis, - fact_lines=fact_lines, - fact_meta=fact_meta, - avoid_tags=history_tags, - ) - prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)} - selected = _select_diverse_insights( - candidates, - preference=analysis.get("preference", "balanced"), - prefer_tags=prefer_tags, - avoid_tags=history_tags, - history_tags=history_tags, - fact_meta=fact_meta, - count=2, - ) - if state: - state.update("synthesizing", step=3) - synthesis_prompt = ( - "Use the question, fact pack, and selected insights to answer in 2-4 sentences. " - "Speak naturally, not as a list. " - "If the question is subjective, add a light opinion grounded in facts. " - "Avoid repeating the exact same observation as the most recent response if possible. " - "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" - f"Question: {prompt}\n" - f"Selected: {json.dumps(selected, ensure_ascii=False)}" - ) - context = _append_history_context(fact_pack, history_lines) - reply = _ollama_call_safe( - ("fast", "open"), - synthesis_prompt, - context=context, - fallback="I don't have enough data to answer that.", - system_override=_open_ended_system(), - ) - return _ensure_scores(reply) - - -def _interpret_open_question( - prompt: str, - *, - fact_pack: str, - history_lines: list[str], - tags_available: set[str], - avoid_tags: set[str], - state: ThoughtState | None = None, -) -> dict[str, Any]: - tags_list = ", ".join(sorted(tags_available)) if tags_available else "none" - avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" - prompt_text = ( - "Analyze the question against the fact pack. " - "Return JSON: {\"focus\":\"...\",\"preference\":\"balanced|novelty|utilization|stability|risk\"," - "\"tags\":[\"...\"] ,\"notes\":\"...\"}. " - "If the question implies interesting/unique/unconventional/cool, set preference to novelty " - "and prefer dynamic tags (utilization/pods/database/availability) when possible. " - f"Use only these tags if relevant: {tags_list}. Avoid tags: {avoid_list}. " - "Use only the fact pack." - ) - context = _append_history_context(fact_pack, history_lines) - analysis = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) - if not isinstance(analysis, dict): - analysis = {"focus": "cluster snapshot", "preference": "balanced", "notes": "", "tags": []} - preference = analysis.get("preference") or "balanced" - if preference not in ("balanced", "novelty", "utilization", "stability", "risk"): - preference = "balanced" - analysis["preference"] = preference - analysis.setdefault("focus", "cluster snapshot") - analysis.setdefault("notes", "") - tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else [] - clean_tags = {t for t in tags if isinstance(t, str)} - analysis["tags"] = sorted(clean_tags & tags_available) - if state: - state.update("planning", step=1, note=str(analysis.get("focus") or "")) - return analysis - - -def _select_insights( - prompt: str, - *, - fact_pack: str, - history_lines: list[str], - state: ThoughtState, - analysis: dict[str, Any], - fact_lines: list[str], - fact_meta: dict[str, dict[str, Any]], - avoid_tags: set[str], -) -> list[dict[str, Any]]: - preferred_tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else [] - prefer_list = ", ".join(sorted({t for t in preferred_tags if isinstance(t, str)})) - avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" - available_list = ", ".join(sorted({t for t in _ALLOWED_INSIGHT_TAGS})) - insight_prompt = ( - "From the fact pack, select 3-5 candidate insights that could answer the question. " - "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," - "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\",\"tags\":[\"...\"]}]}. " - f"Available tags: {available_list}. Prefer tags: {prefer_list or 'none'}. Avoid tags: {avoid_list}. " - "Use only the fact pack and provided tags." - ) - state.update("drafting candidates", step=2) - context = _append_history_context(fact_pack, history_lines) - result = _ollama_json_call(insight_prompt + f" Question: {prompt}", context=context) - insights = result.get("insights") if isinstance(result, dict) else None - if not isinstance(insights, list): - insights = [] - cleaned: list[dict[str, Any]] = [] - for item in insights: - if not isinstance(item, dict): - continue - if not item.get("summary") or not item.get("fact_ids"): - continue - tags = _insight_tags(item, fact_meta) - item["tags"] = sorted(tags) - cleaned.append(item) - state.update("drafting candidates", step=2, note=_candidate_note(item)) - seeds = _seed_insights(fact_lines, fact_meta) - for seed in seeds: - cleaned.append(seed) - return cleaned - - -def _score_insight(insight: dict[str, Any], preference: str) -> float: - relevance = insight.get("relevance") if isinstance(insight.get("relevance"), (int, float)) else 0.0 - novelty = insight.get("novelty") if isinstance(insight.get("novelty"), (int, float)) else 0.0 - if preference == "novelty": - return 0.4 * relevance + 0.6 * novelty - if preference == "utilization": - return 0.7 * relevance + 0.3 * novelty - if preference == "stability": - return 0.7 * relevance + 0.3 * novelty - if preference == "risk": - return 0.6 * relevance + 0.4 * novelty - return 0.6 * relevance + 0.4 * novelty def _open_ended_deep( prompt: str, *, fact_pack: str, - fact_ids: set[str], history_lines: list[str], - fact_lines: list[str], - fact_meta: dict[str, dict[str, Any]], - tags_available: set[str], - history_tags: set[str], state: ThoughtState | None = None, ) -> str: - state = state or ThoughtState() - if not fact_ids: - return _ensure_scores("I don't have enough data to answer that.") - state.total_steps = 7 - analysis = _interpret_open_question( + return _open_ended_multi( prompt, fact_pack=fact_pack, history_lines=history_lines, - tags_available=tags_available, - avoid_tags=history_tags, + mode="deep", state=state, ) - candidates = _select_insights( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - state=state, - analysis=analysis, - fact_lines=fact_lines, - fact_meta=fact_meta, - avoid_tags=history_tags, - ) - state.update("verifying", step=3, note="scoring insights") - filtered: list[dict[str, Any]] = [] - for cand in candidates: - cites = cand.get("fact_ids") if isinstance(cand.get("fact_ids"), list) else [] - if cites and not all(cite in fact_ids for cite in cites): - continue - filtered.append(cand) - if not filtered: - filtered = candidates - - preference = analysis.get("preference", "balanced") - prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)} - top = _select_diverse_insights( - filtered, - preference=preference, - prefer_tags=prefer_tags, - avoid_tags=history_tags, - history_tags=history_tags, - fact_meta=fact_meta, - count=2, - ) - state.update("synthesizing", step=4, note="composing response") - synth_prompt = ( - "Use the question, fact pack, and selected insights to craft a concise answer. " - "Write 2-4 sentences. Explain why the selected insights stand out. " - "If the question is subjective, include a light opinion grounded in facts. " - "Avoid repeating the same observation as the last response if possible. " - "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n" - f"Question: {prompt}\n" - f"Interpretation: {json.dumps(analysis, ensure_ascii=False)}\n" - f"Recent tags: {', '.join(sorted(history_tags)) if history_tags else 'none'}\n" - f"Selected: {json.dumps(top, ensure_ascii=False)}" - ) - context = _append_history_context(fact_pack, history_lines) - reply = _ollama_call_safe( - ("deep", "open"), - synth_prompt, - context=context, - fallback="I don't have enough data to answer that.", - system_override=_open_ended_system(), - ) - state.update("done", step=7) - return _ensure_scores(reply) - def open_ended_answer( prompt: str, @@ -3081,30 +2901,17 @@ def open_ended_answer( if not lines: return _ensure_scores("I don't have enough data to answer that.") fact_pack = _fact_pack_text(lines) - fact_ids = {f"F{i+1}" for i in range(len(lines))} - fact_meta = _fact_pack_meta(lines) - tags_available = {tag for entry in fact_meta.values() for tag in entry.get("tags", [])} - history_tags = _history_tags(history_lines) if mode == "fast": return _open_ended_fast( prompt, fact_pack=fact_pack, history_lines=history_lines, - fact_lines=lines, - fact_meta=fact_meta, - tags_available=tags_available, - history_tags=history_tags, state=state, ) return _open_ended_deep( prompt, fact_pack=fact_pack, - fact_ids=fact_ids, history_lines=history_lines, - fact_lines=lines, - fact_meta=fact_meta, - tags_available=tags_available, - history_tags=history_tags, state=state, ) @@ -3175,12 +2982,12 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): workloads=workloads, ) followup = _is_followup_query(cleaned) - cluster_query = ( - _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) - or _knowledge_intent(cleaned) - or _is_subjective_query(cleaned) - or (history_cluster and followup) - ) + cleaned_q = normalize_query(cleaned) + cluster_affinity = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + subjective = _is_subjective_query(cleaned) + followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS) + contextual = history_cluster and (followup or followup_affinity) + cluster_query = cluster_affinity or contextual context = "" if cluster_query: context = build_context( @@ -3608,7 +3415,7 @@ def open_ended_with_thinking( ) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() - total_steps = 4 if mode == "fast" else 7 + total_steps = _open_ended_total_steps(mode) state = ThoughtState(total_steps=total_steps) def worker(): @@ -3722,12 +3529,12 @@ def sync_loop(token: str, room_id: str): workloads=workloads, ) followup = _is_followup_query(cleaned_body) - cluster_query = ( - _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) - or _knowledge_intent(cleaned_body) - or _is_subjective_query(cleaned_body) - or (history_cluster and followup) - ) + cleaned_q = normalize_query(cleaned_body) + cluster_affinity = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + subjective = _is_subjective_query(cleaned_body) + followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS) + contextual = history_cluster and (followup or followup_affinity) + cluster_query = cluster_affinity or contextual context = "" if cluster_query: context = build_context( From 32884e0b7e43cbda1acaa51fe99faf602f100bf1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:28:37 -0300 Subject: [PATCH 369/416] monitoring: fix grafana smtp from address --- services/monitoring/helmrelease.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 6185e595..78eaf3c4 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -340,7 +340,7 @@ spec: GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" GF_SMTP_ENABLED: "true" GF_SMTP_HOST: "smtp.postmarkapp.com:587" - GF_SMTP_FROM: "no-reply-grafana@bstein.dev" + GF_SMTP_FROM_ADDRESS: "no-reply-grafana@bstein.dev" GF_SMTP_FROM_NAME: "Atlas Grafana" GRAFANA_ALERT_EMAILS: "brad@bstein.dev" GF_SECURITY_ALLOW_EMBEDDING: "true" From d1611c4f4f31929aab6f3b32b992e7b01fee5162 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:32:25 -0300 Subject: [PATCH 370/416] atlasbot: fix score formatting --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 17e2cb2f..7ad44d4b 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-75 + checksum/atlasbot-configmap: manual-atlasbot-76 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 06685217..9ecd06d9 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2614,7 +2614,7 @@ def _candidate_note(candidate: dict[str, Any]) -> str: def _ensure_scores(answer: str) -> str: text = answer.strip() - lines = [line for line in text.splitlines() if line.strip()] + lines = [line.strip() for line in text.splitlines() if line.strip()] has_relevance = any(line.lower().startswith("relevance") for line in lines) has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines) has_confidence = any(line.lower().startswith("confidence") for line in lines) From 2952b2a7c3448c208e9dff196b39852c8a39801f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:44:49 -0300 Subject: [PATCH 371/416] atlasbot: refine cluster intent handling --- services/comms/scripts/atlasbot/bot.py | 92 ++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 5 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 9ecd06d9..f85b81a0 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -152,6 +152,16 @@ CLUSTER_HINT_WORDS = { "deployment", "daemonset", "statefulset", + "snapshot", + "anomaly", + "anomalies", + "monitor", + "monitoring", + "runbook", + "runbooks", + "documentation", + "docs", + "playbook", "grafana", "victoria", "prometheus", @@ -203,6 +213,12 @@ _INSIGHT_HINT_WORDS = { "favorite", "favourite", "trivia", + "anomaly", + "anomalies", + "monitor", + "monitoring", + "alert", + "alerts", "stand out", "stands out", } @@ -532,7 +548,14 @@ def _humanize_rate(value: str, *, unit: str) -> str: return f"{val:.2f} B/s" def _has_any(text: str, phrases: tuple[str, ...]) -> bool: - return any(p in text for p in phrases) + for phrase in phrases: + if " " in phrase: + if phrase in text: + return True + else: + if re.search(rf"\\b{re.escape(phrase)}\\b", text): + return True + return False def _detect_operation(q: str) -> str | None: if _has_any(q, OPERATION_HINTS["top"]): @@ -552,6 +575,8 @@ def _detect_metric(q: str) -> str | None: part = part.strip() if len(part) >= 2: expanded.add(part) + if part.endswith("s") and len(part) >= 4: + expanded.add(part[:-1]) tokens = expanded for metric, phrases in METRIC_HINTS.items(): for phrase in phrases: @@ -565,6 +590,8 @@ def _detect_metric(q: str) -> str | None: def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: include: set[str] = set() exclude: set[str] = set() + if any(term in q for term in ("gpu", "gpus", "accelerator", "accelerators", "cuda", "nvidia")): + include.add("jetson") rpi_specific = any( phrase in q for phrase in ( @@ -1287,6 +1314,10 @@ def snapshot_metric_answer( failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") status_terms = ("running", "pending", "failed", "succeeded", "completed") + if "not running" in q or "not in running" in q or "non running" in q: + parts = [v for v in (pending, failed, succeeded) if isinstance(v, (int, float))] + if parts: + return _format_confidence(f"Pods not running: {sum(parts):.0f}.", "high") if sum(1 for term in status_terms if term in q) > 1: parts = [] if running is not None: @@ -1350,6 +1381,8 @@ def structured_answer( op = "top" entity = _detect_entity(q) include_hw, exclude_hw = _detect_hardware_filters(q) + if entity is None and (include_hw or exclude_hw): + entity = "node" nodes_in_query = _extract_titan_nodes(q) only_workers = "worker" in q or "workers" in q role_filters = _detect_role_filters(q) @@ -1385,6 +1418,20 @@ def structured_answer( if hw_line: return _format_confidence(hw_line, "medium") + if ( + entity == "node" + and any(term in q for term in ("arm64", "amd64")) + and any(term in q for term in ("mostly", "majority", "more")) + ): + arm64_count = len([n for n in inventory if n.get("arch") == "arm64"]) + amd64_count = len([n for n in inventory if n.get("arch") == "amd64"]) + if arm64_count or amd64_count: + majority = "arm64" if arm64_count >= amd64_count else "amd64" + return _format_confidence( + f"arm64 nodes: {arm64_count}, amd64 nodes: {amd64_count}. Mostly {majority}.", + "high", + ) + if op == "top" and metric is None and not any(word in q for word in ("hardware", "architecture", "class")): metric = "cpu" @@ -1491,6 +1538,27 @@ def structured_answer( ) if op == "count": + if only_workers and "ready" in q and ("total" in q or "vs" in q or "versus" in q): + total_workers = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=True, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + ready_workers = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=True, + only_ready=True, + nodes_in_query=nodes_in_query, + ) + return _format_confidence( + f"Worker nodes ready: {len(ready_workers)} / {len(total_workers)} total.", + "high", + ) if expected_workers and ("expected" in q or "should" in q): missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." @@ -1711,6 +1779,15 @@ def _doc_intent(query: str) -> bool: "how to", "instructions", "playbook", + "next step", + "next steps", + "what should", + "what do i", + "what to do", + "troubleshoot", + "triage", + "recover", + "remediate", ) ) @@ -2615,10 +2692,13 @@ def _candidate_note(candidate: dict[str, Any]) -> str: def _ensure_scores(answer: str) -> str: text = answer.strip() lines = [line.strip() for line in text.splitlines() if line.strip()] - has_relevance = any(line.lower().startswith("relevance") for line in lines) - has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines) - has_confidence = any(line.lower().startswith("confidence") for line in lines) - has_risk = any(line.lower().startswith("hallucinationrisk") for line in lines) + def _score_key(line: str) -> str: + cleaned = line.strip().lstrip("-•* ").strip() + return cleaned.lower() + has_relevance = any(_score_key(line).startswith("relevance") for line in lines) + has_satisfaction = any(_score_key(line).startswith("satisfaction") for line in lines) + has_confidence = any(_score_key(line).startswith("confidence") for line in lines) + has_risk = any(_score_key(line).startswith("hallucinationrisk") for line in lines) if not has_confidence: lines.append("Confidence: medium") if not has_relevance: @@ -3004,6 +3084,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): _is_subjective_query(cleaned) or _knowledge_intent(cleaned) or _is_overview_query(cleaned) + or _doc_intent(cleaned) ) if open_ended: answer = open_ended_answer( @@ -3558,6 +3639,7 @@ def sync_loop(token: str, room_id: str): _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body) or _is_overview_query(cleaned_body) + or _doc_intent(cleaned_body) ) if open_ended: reply = open_ended_with_thinking( From 269b5bdca80952dcb70e340deb99df8150967688 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:45:17 -0300 Subject: [PATCH 372/416] chore: bump atlasbot config checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7ad44d4b..01aebef8 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-76 + checksum/atlasbot-configmap: manual-atlasbot-77 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From c95a580f84ff8d616f7e76a121082a335aed8953 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:55:00 -0300 Subject: [PATCH 373/416] atlasbot: tighten scoring and readiness logic --- services/comms/scripts/atlasbot/bot.py | 97 +++++++++++++++++++++----- 1 file changed, 81 insertions(+), 16 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index f85b81a0..29f53751 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1297,7 +1297,7 @@ def snapshot_metric_answer( parts: list[str] = [] if used is not None and max_conn is not None: free = max_conn - used - if any(word in q for word in ("free", "available", "remaining")): + if any(word in q for word in ("free", "available", "remaining", "remain", "left")): parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max ({free:.0f} free).") else: parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") @@ -1387,13 +1387,23 @@ def structured_answer( only_workers = "worker" in q or "workers" in q role_filters = _detect_role_filters(q) only_ready: bool | None = None - if "not ready" in q or "unready" in q or "down" in q or "missing" in q: + if ( + "not ready" in q + or "notready" in q + or "not-ready" in q + or "unready" in q + or "down" in q + or "missing" in q + ): only_ready = False elif "ready" in q: only_ready = True if entity == "node" and only_ready is not None and op != "count": op = "status" + if entity == "node" and only_ready is not None and op == "count": + if not any(term in q for term in ("how many", "count", "number")): + op = "status" if not op and entity == "node": op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" @@ -2692,22 +2702,67 @@ def _candidate_note(candidate: dict[str, Any]) -> str: def _ensure_scores(answer: str) -> str: text = answer.strip() lines = [line.strip() for line in text.splitlines() if line.strip()] + score_map: dict[str, str] = {} + body_lines: list[str] = [] + def _score_key(line: str) -> str: cleaned = line.strip().lstrip("-•* ").strip() return cleaned.lower() - has_relevance = any(_score_key(line).startswith("relevance") for line in lines) - has_satisfaction = any(_score_key(line).startswith("satisfaction") for line in lines) - has_confidence = any(_score_key(line).startswith("confidence") for line in lines) - has_risk = any(_score_key(line).startswith("hallucinationrisk") for line in lines) - if not has_confidence: - lines.append("Confidence: medium") - if not has_relevance: - lines.append("Relevance: 70") - if not has_satisfaction: - lines.append("Satisfaction: 70") - if not has_risk: - lines.append("HallucinationRisk: low") - return "\n".join(lines) + + def _extract_value(line: str) -> str: + cleaned = line.strip().lstrip("-•* ").strip() + if ":" in cleaned: + return cleaned.split(":", 1)[1].strip() + parts = cleaned.split() + return parts[1] if len(parts) > 1 else "" + + def _record_score(key: str, value: str): + if not value: + return + score_map.setdefault(key, value) + + for line in lines: + cleaned = line.strip().lstrip("-•* ").strip() + lowered = cleaned.lower() + if lowered.startswith("confidence,") or ( + "confidence" in lowered and "relevance" in lowered and "satisfaction" in lowered + ): + for key in ("confidence", "relevance", "satisfaction"): + match = re.search(rf"{key}\\s*[:=]?\\s*(\\d{{1,3}}|high|medium|low)", lowered) + if match: + _record_score(key, match.group(1)) + risk_match = re.search(r"hallucination\\s*risk\\s*[:=]?\\s*(low|medium|high)", lowered) + if risk_match: + _record_score("hallucinationrisk", risk_match.group(1)) + continue + if lowered.startswith("confidence"): + _record_score("confidence", _extract_value(cleaned)) + continue + if lowered.startswith("relevance"): + _record_score("relevance", _extract_value(cleaned)) + continue + if lowered.startswith("satisfaction"): + _record_score("satisfaction", _extract_value(cleaned)) + continue + if lowered.replace(" ", "").startswith("hallucinationrisk") or lowered.startswith( + "hallucination risk" + ): + _record_score("hallucinationrisk", _extract_value(cleaned)) + continue + body_lines.append(line) + + confidence = score_map.get("confidence") or "medium" + relevance = score_map.get("relevance") or "70" + satisfaction = score_map.get("satisfaction") or "70" + risk = score_map.get("hallucinationrisk") or "low" + + final_lines = body_lines + [ + f"Confidence: {confidence}", + f"Relevance: {relevance}", + f"Satisfaction: {satisfaction}", + f"HallucinationRisk: {risk}", + ] + return "\n".join(final_lines) def _open_ended_plan( @@ -2799,7 +2854,8 @@ def _open_ended_candidate( f"{focus}. " "Write 2-4 sentences in plain prose (not a list). " "If you infer, label it as inference. " - "Return JSON: {\"answer\":\"...\",\"confidence\":\"high|medium|low\"," + "List which fact pack IDs you used. " + "Return JSON: {\"answer\":\"...\",\"facts_used\":[\"F1\"],\"confidence\":\"high|medium|low\"," "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}." ) context = _append_history_context(fact_pack, history_lines) @@ -2809,9 +2865,13 @@ def _open_ended_candidate( answer = str(result.get("answer") or "").strip() if not answer: answer = "I don't have enough data to answer that from the current snapshot." + facts_used = result.get("facts_used") + if not isinstance(facts_used, list): + facts_used = [] candidate = { "focus": focus, "answer": answer, + "facts_used": facts_used, "confidence": result.get("confidence", "medium"), "relevance": _normalize_score(result.get("relevance"), default=60), "satisfaction": _normalize_score(result.get("satisfaction"), default=60), @@ -2826,6 +2886,8 @@ def _candidate_score(candidate: dict[str, Any]) -> float: satisfaction = _normalize_score(candidate.get("satisfaction"), default=60) confidence = _confidence_score(candidate.get("confidence")) score = relevance * 0.45 + satisfaction * 0.35 + confidence * 0.2 + if not candidate.get("facts_used"): + score -= 5 return score - _risk_penalty(candidate.get("risk")) @@ -2863,6 +2925,9 @@ def _open_ended_synthesize( "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. " "Use only the fact pack as evidence. " "If you infer, label it as inference. " + "Do not claim nodes are missing or not ready unless the fact pack explicitly lists " + "nodes_not_ready or expected_workers_missing. " + "Keep the tone conversational and answer the user's intent directly. " "Avoid repeating the last response if possible. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " "HallucinationRisk (low|medium|high).\n" From 413b9eca5d4490b9cee0968410b4d0ed7aea69a4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 22:55:24 -0300 Subject: [PATCH 374/416] chore: bump atlasbot checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 01aebef8..a06e6283 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-77 + checksum/atlasbot-configmap: manual-atlasbot-78 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 4651133debeb0c5920206f245fb6c9513de966e1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:01:51 -0300 Subject: [PATCH 375/416] atlasbot: fix word boundary detection --- services/comms/scripts/atlasbot/bot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 29f53751..77868f1f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -553,7 +553,7 @@ def _has_any(text: str, phrases: tuple[str, ...]) -> bool: if phrase in text: return True else: - if re.search(rf"\\b{re.escape(phrase)}\\b", text): + if re.search(rf"\b{re.escape(phrase)}\b", text): return True return False From b16f841e9aac45bbe79edd7824042a29690a42d2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:02:22 -0300 Subject: [PATCH 376/416] chore: bump atlasbot checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index a06e6283..530fb407 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-78 + checksum/atlasbot-configmap: manual-atlasbot-79 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From d9951083ee24475caab9ab9094f00002f76717a5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:16:53 -0300 Subject: [PATCH 377/416] atlasbot: improve metric detection and counts --- services/comms/scripts/atlasbot/bot.py | 81 +++++++++++++++++++++----- 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 77868f1f..eca5fef9 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -120,6 +120,7 @@ OPERATION_HINTS = { "count": ("how many", "count", "number", "total"), "list": ("list", "which", "what are", "show", "names"), "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum", "busiest", "busy"), + "bottom": ("lowest", "least", "minimum", "min", "smallest"), "status": ("ready", "not ready", "unready", "down", "missing", "status"), } @@ -568,6 +569,14 @@ def _detect_operation(q: str) -> str | None: return None def _detect_metric(q: str) -> str | None: + q = normalize_query(q) + if _has_any(q, ("disk", "storage")): + return "io" + if _has_any(q, ("io",)) and not _has_any(q, METRIC_HINTS["net"]): + return "io" + for metric, phrases in METRIC_HINTS.items(): + if _has_any(q, phrases): + return metric tokens = set(_tokens(q)) expanded: set[str] = set(tokens) for token in list(tokens): @@ -1237,6 +1246,34 @@ def _node_usage_top( return None +def _node_usage_bottom( + usage: list[dict[str, Any]], + *, + allowed_nodes: set[str] | None, +) -> tuple[str, float] | None: + best_node: str | None = None + best_val: float | None = None + for item in usage: + if not isinstance(item, dict): + continue + node = item.get("node") + if not node or not isinstance(node, str): + continue + if allowed_nodes and node not in allowed_nodes: + continue + value = item.get("value") + try: + numeric = float(value) + except (TypeError, ValueError): + continue + if best_val is None or numeric < best_val: + best_val = numeric + best_node = node + if best_node and best_val is not None: + return best_node, best_val + return None + + def snapshot_metric_answer( prompt: str, *, @@ -1267,18 +1304,20 @@ def snapshot_metric_answer( ) allowed_nodes = {node["name"] for node in filtered} if filtered else None - if metric in {"cpu", "ram", "net", "io"} and op in {"top", "status", None}: + if metric in {"cpu", "ram", "net", "io"} and op in {"top", "bottom", "status", None}: usage = metrics.get("node_usage", {}).get(metric, []) - top = _node_usage_top(usage, allowed_nodes=allowed_nodes) - if top: - node, val = top + pick = _node_usage_bottom if op == "bottom" else _node_usage_top + chosen = pick(usage, allowed_nodes=allowed_nodes) + if chosen: + node, val = chosen percent = metric in {"cpu", "ram"} value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"}) scope = "" if include_hw: scope = f" among {' and '.join(sorted(include_hw))}" - answer = f"Hottest node{scope}: {node} ({value})." - if allowed_nodes and len(allowed_nodes) != len(inventory): + label = "Lowest" if op == "bottom" else "Hottest" + answer = f"{label} node{scope}: {node} ({value})." + if allowed_nodes and len(allowed_nodes) != len(inventory) and op != "bottom": overall = _node_usage_top(usage, allowed_nodes=None) if overall and overall[0] != node: overall_val = _format_metric_value( @@ -1314,6 +1353,10 @@ def snapshot_metric_answer( failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") status_terms = ("running", "pending", "failed", "succeeded", "completed") + if "total" in q or "sum" in q: + values = [v for v in (running, pending, failed, succeeded) if isinstance(v, (int, float))] + if values: + return _format_confidence(f"Total pods: {sum(values):.0f}.", "high") if "not running" in q or "not in running" in q or "non running" in q: parts = [v for v in (pending, failed, succeeded) if isinstance(v, (int, float))] if parts: @@ -1468,7 +1511,8 @@ def structured_answer( node, val = _primary_series_metric(res) if node and val is not None: percent = _metric_expr_uses_percent(entry) - value_fmt = _format_metric_value(val or "", percent=percent) + rate = metric in {"net", "io"} + value_fmt = _format_metric_value(val or "", percent=percent, rate=rate) metric_label = (metric or "").upper() label = f"{metric_label} node" if metric_label else "node" answer = f"Hottest {label}: {node} ({value_fmt})." @@ -1495,7 +1539,8 @@ def structured_answer( scoped_node, scoped_val = _primary_series_metric(res) if base_node and scoped_node and base_node != scoped_node: percent = _metric_expr_uses_percent(entry) - base_val_fmt = _format_metric_value(base_val or "", percent=percent) + rate = metric in {"net", "io"} + base_val_fmt = _format_metric_value(base_val or "", percent=percent, rate=rate) overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})." return _format_confidence(f"Among {scope} nodes, {answer}{overall_note}", "high") return _format_confidence(answer, "high") @@ -1525,9 +1570,14 @@ def structured_answer( names = [node["name"] for node in filtered] if op == "status": + scope_label = "nodes" + if include_hw: + scope_label = f"{' and '.join(sorted(include_hw))} nodes" + elif only_workers: + scope_label = "worker nodes" if "missing" in q and ("ready" in q or "readiness" in q): return _format_confidence( - "Not ready nodes: " + (", ".join(names) if names else "none") + ".", + f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".", "high", ) if "missing" in q and expected_workers: @@ -1538,16 +1588,21 @@ def structured_answer( ) if only_ready is False: return _format_confidence( - "Not ready nodes: " + (", ".join(names) if names else "none") + ".", + f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".", "high", ) if only_ready is True: return _format_confidence( - f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + ".", + f"Ready {scope_label} ({len(names)}): " + (", ".join(names) if names else "none") + ".", "high", ) if op == "count": + scope_label = "nodes" + if include_hw: + scope_label = f"{' and '.join(sorted(include_hw))} nodes" + elif only_workers: + scope_label = "worker nodes" if only_workers and "ready" in q and ("total" in q or "vs" in q or "versus" in q): total_workers = _inventory_filter( inventory, @@ -1576,9 +1631,9 @@ def structured_answer( msg += f" Missing: {', '.join(missing)}." return _format_confidence(msg, "high") if only_ready is True: - return _format_confidence(f"Ready nodes: {len(names)}.", "high") + return _format_confidence(f"Ready {scope_label}: {len(names)}.", "high") if only_ready is False: - return _format_confidence(f"Not ready nodes: {len(names)}.", "high") + return _format_confidence(f"Not ready {scope_label}: {len(names)}.", "high") if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters): return _format_confidence(f"Atlas has {len(names)} nodes.", "high") return _format_confidence(f"Matching nodes: {len(names)}.", "high") From 8a22e8e0d8d3dee24068dbff9f507f4d23da5ac6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:17:23 -0300 Subject: [PATCH 378/416] chore: bump atlasbot checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 530fb407..94eeea70 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-79 + checksum/atlasbot-configmap: manual-atlasbot-80 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 9a978c5e727b94ccbf81024ba8926f7b5c244353 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:23:42 -0300 Subject: [PATCH 379/416] monitoring: tune cpu and maintenance alerts --- services/monitoring/grafana-alerting-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index 8713d3db..d97db150 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -145,7 +145,7 @@ data: model: intervalMs: 60000 maxDataPoints: 43200 - expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] + expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\") legendFormat: '{{instance}}' datasource: type: prometheus @@ -175,9 +175,9 @@ data: type: last type: query noDataState: NoData - execErrState: Error + execErrState: NoData annotations: - summary: "{{ $labels.instance }} CPU >90% for 10m" + summary: "{{ $labels.node }} CPU >90% for 10m" labels: severity: warning - orgId: 1 @@ -297,7 +297,7 @@ data: to: 0 datasourceUid: atlas-vm model: - expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) + expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0) intervalMs: 60000 maxDataPoints: 43200 legendFormat: '{{cronjob}}' From 35396d19ea338cb4ec88b6b8a22bcb8c5266b8d4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:24:12 -0300 Subject: [PATCH 380/416] atlasbot: fix bottom ops and pod queries --- services/comms/scripts/atlasbot/bot.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index eca5fef9..7f22ad57 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -163,6 +163,8 @@ CLUSTER_HINT_WORDS = { "documentation", "docs", "playbook", + "utilization", + "usage", "grafana", "victoria", "prometheus", @@ -561,8 +563,10 @@ def _has_any(text: str, phrases: tuple[str, ...]) -> bool: def _detect_operation(q: str) -> str | None: if _has_any(q, OPERATION_HINTS["top"]): return "top" + if _has_any(q, OPERATION_HINTS["bottom"]): + return "bottom" for op, phrases in OPERATION_HINTS.items(): - if op == "top": + if op in ("top", "bottom"): continue if _has_any(q, phrases): return op @@ -1353,6 +1357,11 @@ def snapshot_metric_answer( failed = metrics.get("pods_failed") succeeded = metrics.get("pods_succeeded") status_terms = ("running", "pending", "failed", "succeeded", "completed") + if ("most pods" in q or ("most" in q and "pod" in q and "node" in q)) and not nodes_in_query: + return _format_confidence( + "I don't have per-node pod counts in the snapshot.", + "medium", + ) if "total" in q or "sum" in q: values = [v for v in (running, pending, failed, succeeded) if isinstance(v, (int, float))] if values: @@ -1363,13 +1372,13 @@ def snapshot_metric_answer( return _format_confidence(f"Pods not running: {sum(parts):.0f}.", "high") if sum(1 for term in status_terms if term in q) > 1: parts = [] - if running is not None: + if "running" in q and running is not None: parts.append(f"running {running:.0f}") - if pending is not None: + if "pending" in q and pending is not None: parts.append(f"pending {pending:.0f}") - if failed is not None: + if "failed" in q and failed is not None: parts.append(f"failed {failed:.0f}") - if succeeded is not None: + if ("succeeded" in q or "completed" in q) and succeeded is not None: parts.append(f"succeeded {succeeded:.0f}") if parts: return _format_confidence(f"Pods: {', '.join(parts)}.", "high") @@ -1461,7 +1470,12 @@ def structured_answer( if hw_line: return _format_confidence(hw_line, "high") - if entity == "node" and op == "status" and metric is None: + if ( + entity == "node" + and op == "status" + and metric is None + and not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters) + ): summary = _nodes_summary_line(inventory, snapshot) if summary: return _format_confidence(summary, "high") From fa9184bc9107089ff7c949fb8b28558b3b0b9378 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:24:46 -0300 Subject: [PATCH 381/416] chore: bump atlasbot checksum --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 94eeea70..6761287b 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-80 + checksum/atlasbot-configmap: manual-atlasbot-81 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From a49fa6dd33f9dcab3ac3e4522d6f73870c0609d8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:29:46 -0300 Subject: [PATCH 382/416] monitoring: restart grafana for alerting reload --- services/monitoring/helmrelease.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 78eaf3c4..66517389 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -286,6 +286,7 @@ spec: podAnnotations: vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "monitoring" + monitoring.bstein.dev/restart-rev: "1" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-template-grafana-env.sh: | {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} From 35d5d5a1a30774c655f5390337a3b6445fd9e958 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:34:11 -0300 Subject: [PATCH 383/416] monitoring: fix grafana alert exec state --- services/monitoring/grafana-alerting-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index d97db150..33ac7396 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -175,7 +175,7 @@ data: type: last type: query noDataState: NoData - execErrState: NoData + execErrState: OK annotations: summary: "{{ $labels.node }} CPU >90% for 10m" labels: From 3bd42c93d63959dd2e2569627cc2d2eb90744f54 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:45:08 -0300 Subject: [PATCH 384/416] atlasbot: overhaul reasoning pipeline --- services/comms/atlasbot-deployment.yaml | 6 +- services/comms/scripts/atlasbot/bot.py | 405 +++++++++++++++++++----- 2 files changed, 336 insertions(+), 75 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 6761287b..b08f20db 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-81 + checksum/atlasbot-configmap: manual-atlasbot-82 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -83,6 +83,10 @@ spec: value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL value: qwen2.5:14b-instruct + - name: ATLASBOT_MODEL_FAST + value: qwen2.5:14b-instruct + - name: ATLASBOT_MODEL_DEEP + value: qwen2.5:14b-instruct - name: OLLAMA_FALLBACK_MODEL value: qwen2.5:14b-instruct-q4_0 - name: OLLAMA_TIMEOUT_SEC diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7f22ad57..7e6341e6 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -17,6 +17,8 @@ ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") +MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "") +MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "") FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") API_KEY = os.environ.get("CHAT_API_KEY", "") OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) @@ -372,6 +374,14 @@ def _detect_mode_from_body(body: str, *, default: str = "deep") -> str: return default +def _model_for_mode(mode: str) -> str: + if mode == "fast" and MODEL_FAST: + return MODEL_FAST + if mode == "deep" and MODEL_DEEP: + return MODEL_DEEP + return MODEL + + # Matrix HTTP helper. def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): url = (base or BASE) + path @@ -2487,7 +2497,13 @@ class ThoughtState: return f"Still thinking ({detail})." -def _ollama_json_call(prompt: str, *, context: str, retries: int = 2) -> dict[str, Any]: +def _ollama_json_call( + prompt: str, + *, + context: str, + retries: int = 2, + model: str | None = None, +) -> dict[str, Any]: system = ( "System: You are Atlas, a reasoning assistant. " "Return strict JSON only (no code fences, no trailing commentary). " @@ -2504,6 +2520,7 @@ def _ollama_json_call(prompt: str, *, context: str, retries: int = 2) -> dict[st context=context, use_history=False, system_override=system, + model=model, ) cleaned = _strip_code_fence(raw).strip() if cleaned.startswith("{") and cleaned.endswith("}"): @@ -2547,6 +2564,19 @@ def _fact_pack_text(lines: list[str]) -> str: return "Fact pack:\n" + "\n".join(labeled) +def _tool_fact_lines(prompt: str, *, allow_tools: bool) -> list[str]: + if not allow_tools: + return [] + metrics_context, _ = metrics_query_context(prompt, allow_tools=True) + lines: list[str] = [] + if metrics_context: + for line in metrics_context.splitlines(): + trimmed = line.strip() + if trimmed: + lines.append(f"tool_metrics: {trimmed}") + return lines + + _ALLOWED_INSIGHT_TAGS = { "availability", "architecture", @@ -2607,6 +2637,15 @@ def _history_tags(history_lines: list[str]) -> set[str]: return tags & _ALLOWED_INSIGHT_TAGS +def _normalize_fraction(value: Any, *, default: float = 0.5) -> float: + if isinstance(value, (int, float)): + score = float(value) + if score > 1: + score = score / 100.0 + return max(0.0, min(1.0, score)) + return default + + def _seed_insights( lines: list[str], fact_meta: dict[str, dict[str, Any]], @@ -2735,9 +2774,9 @@ def _open_ended_system() -> str: "Use ONLY the provided fact pack and recent chat as your evidence. " "You may draw light inferences if you label them as such. " "Write concise, human sentences with a helpful, calm tone (not a list). " - "If the question is subjective, share a light opinion grounded in facts. " + "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " - "Avoid repeating the exact same observation as the last response if possible. " + "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " "Do not invent numbers or facts. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." ) @@ -2750,6 +2789,7 @@ def _ollama_call_safe( context: str, fallback: str, system_override: str | None = None, + model: str | None = None, ) -> str: try: return _ollama_call( @@ -2758,6 +2798,7 @@ def _ollama_call_safe( context=context, use_history=False, system_override=system_override, + model=model, ) except Exception: return fallback @@ -2841,6 +2882,7 @@ def _open_ended_plan( history_lines: list[str], count: int, state: ThoughtState | None, + model: str | None, ) -> list[dict[str, Any]]: if state: state.update("planning", step=1, note="mapping angles") @@ -2850,10 +2892,15 @@ def _open_ended_plan( f"{count} distinct answer angles that can be supported by the fact pack. " "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). " "If the question is subjective, propose at least one angle that surfaces a standout detail. " + "Avoid repeating the same angle as the most recent response if possible. " "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"priority\":1-5}]}." ) context = _append_history_context(fact_pack, history_lines) - result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) angles = result.get("angles") if isinstance(result, dict) else None cleaned: list[dict[str, Any]] = [] seen: set[str] = set() @@ -2883,6 +2930,81 @@ def _open_ended_plan( return cleaned +def _preferred_tags_for_prompt(prompt: str) -> set[str]: + q = normalize_query(prompt) + tags: set[str] = set() + if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")): + tags.add("utilization") + if any(word in q for word in ("postgres", "database", "db", "connections")): + tags.add("database") + if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")): + tags.add("pods") + if any(word in q for word in ("workload", "service", "namespace")): + tags.add("workloads") + if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")): + tags.add("availability") + if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")): + tags.update({"hardware", "inventory", "architecture"}) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _open_ended_insights( + prompt: str, + *, + fact_pack: str, + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + count: int, + state: ThoughtState | None, + model: str | None, +) -> list[dict[str, Any]]: + if state: + state.update("analyzing", note="scouting insights") + count = max(1, count) + allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) + prompt_text = ( + "Review the fact pack and propose up to " + f"{count} insights that could answer the question. " + "Each insight should be grounded in the facts. " + "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," + "\"relevance\":0-1,\"novelty\":0-1,\"tags\":[\"tag\"],\"rationale\":\"...\"}]}. " + f"Only use tags from: {allowed_tags}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + insights = result.get("insights") if isinstance(result, dict) else None + cleaned: list[dict[str, Any]] = [] + valid_ids = set(fact_meta.keys()) + if isinstance(insights, list): + for item in insights: + if not isinstance(item, dict): + continue + summary = str(item.get("summary") or item.get("claim") or "").strip() + if not summary: + continue + raw_ids = item.get("fact_ids") if isinstance(item.get("fact_ids"), list) else [] + fact_ids = [fid for fid in raw_ids if isinstance(fid, str) and fid in valid_ids] + if not fact_ids: + continue + cleaned.append( + { + "summary": summary, + "fact_ids": fact_ids, + "relevance": _normalize_fraction(item.get("relevance"), default=0.6), + "novelty": _normalize_fraction(item.get("novelty"), default=0.5), + "rationale": str(item.get("rationale") or ""), + "tags": [t for t in (item.get("tags") or []) if isinstance(t, str)], + } + ) + if cleaned and state: + state.update("analyzing", note=_candidate_note(cleaned[0])) + return cleaned + + def _normalize_score(value: Any, *, default: int = 60) -> int: if isinstance(value, (int, float)): return int(max(0, min(100, value))) @@ -2915,20 +3037,31 @@ def _open_ended_candidate( history_lines: list[str], state: ThoughtState | None, step: int, + fact_hints: list[str] | None = None, + model: str | None = None, ) -> dict[str, Any]: if state: state.update("drafting", step=step, note=focus) + hint_text = "" + if fact_hints: + hint_text = " Prioritize these fact IDs if relevant: " + ", ".join(fact_hints) + "." prompt_text = ( "Using ONLY the fact pack, answer the question focusing on this angle: " f"{focus}. " - "Write 2-4 sentences in plain prose (not a list). " + "Write 2-4 sentences in plain prose (not a list)." + + hint_text + + " " "If you infer, label it as inference. " "List which fact pack IDs you used. " "Return JSON: {\"answer\":\"...\",\"facts_used\":[\"F1\"],\"confidence\":\"high|medium|low\"," "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}." ) context = _append_history_context(fact_pack, history_lines) - result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) if not isinstance(result, dict): result = {} answer = str(result.get("answer") or "").strip() @@ -2986,9 +3119,12 @@ def _open_ended_synthesize( candidates: list[dict[str, Any]], state: ThoughtState | None, step: int, + model: str | None, + critique: str | None = None, ) -> str: if state: state.update("synthesizing", step=step, note="composing answer") + critique_block = f"\nCritique guidance: {critique}\n" if critique else "\n" synth_prompt = ( "Compose the final answer to the question using the candidate answers below. " "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. " @@ -3001,6 +3137,7 @@ def _open_ended_synthesize( "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " "HallucinationRisk (low|medium|high).\n" f"Question: {prompt}\n" + f"{critique_block}" f"Candidates: {json.dumps(candidates, ensure_ascii=False)}" ) context = _append_history_context(fact_pack, history_lines) @@ -3010,20 +3147,55 @@ def _open_ended_synthesize( context=context, fallback="I don't have enough data to answer that.", system_override=_open_ended_system(), + model=model, ) return _ensure_scores(reply) +def _open_ended_critique( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + candidates: list[dict[str, Any]], + state: ThoughtState | None, + step: int, + model: str | None, +) -> str: + if state: + state.update("reviewing", step=step, note="quality check") + critique_prompt = ( + "Review the candidate answers against the fact pack. " + "Identify any missing important detail or risky inference and give one sentence of guidance. " + "Return JSON: {\"guidance\":\"...\",\"risk\":\"low|medium|high\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + critique_prompt + f" Question: {prompt} Candidates: {json.dumps(candidates, ensure_ascii=False)}", + context=context, + model=model, + ) + if isinstance(result, dict): + guidance = str(result.get("guidance") or "").strip() + if guidance: + return guidance + return "" + + def _open_ended_multi( prompt: str, *, fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], history_lines: list[str], mode: str, state: ThoughtState | None = None, ) -> str: + model = _model_for_mode(mode) angle_count = 2 if mode == "fast" else 4 - total_steps = 1 + angle_count + 2 + insight_count = 2 if mode == "fast" else 4 + total_steps = 2 + angle_count + 2 + (1 if mode == "deep" else 0) if state: state.total_steps = total_steps angles = _open_ended_plan( @@ -3032,10 +3204,57 @@ def _open_ended_multi( history_lines=history_lines, count=angle_count, state=state, + model=model, ) + insights = _open_ended_insights( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + count=insight_count, + state=state, + model=model, + ) + seeds = _seed_insights(fact_lines, fact_meta, limit=max(4, insight_count)) + insight_candidates = insights + seeds + subjective = _is_subjective_query(prompt) + prefer_tags = _preferred_tags_for_prompt(prompt) + history_tags = _history_tags(history_lines) + avoid_tags = history_tags if subjective else set() + preference = "novelty" if subjective else "relevance" + selected_insights = _select_diverse_insights( + insight_candidates, + preference=preference, + prefer_tags=prefer_tags, + avoid_tags=avoid_tags, + history_tags=history_tags, + fact_meta=fact_meta, + count=1 if mode == "fast" else 2, + ) + if state and selected_insights: + state.update("analyzing", note=_candidate_note(selected_insights[0])) + + angle_inputs: list[dict[str, Any]] = [] + for insight in selected_insights: + angle_inputs.append( + { + "focus": str(insight.get("summary") or "Direct answer"), + "fact_ids": insight.get("fact_ids") or [], + } + ) + for angle in angles: + if len(angle_inputs) >= angle_count: + break + angle_inputs.append( + { + "focus": str(angle.get("focus") or "Direct answer"), + "fact_ids": [], + } + ) + candidates: list[dict[str, Any]] = [] - step = 2 - for angle in angles[:angle_count]: + step = 3 + for angle in angle_inputs[:angle_count]: candidates.append( _open_ended_candidate( prompt, @@ -3044,6 +3263,8 @@ def _open_ended_multi( history_lines=history_lines, state=state, step=step, + fact_hints=angle.get("fact_ids") if isinstance(angle.get("fact_ids"), list) else None, + model=model, ) ) step += 1 @@ -3051,6 +3272,18 @@ def _open_ended_multi( state.update("evaluating", step=step, note="ranking candidates") selected = _select_candidates(candidates, count=1 if mode == "fast" else 2) step += 1 + critique = "" + if mode == "deep": + critique = _open_ended_critique( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + state=state, + step=step, + model=model, + ) + step += 1 reply = _open_ended_synthesize( prompt, fact_pack=fact_pack, @@ -3058,6 +3291,8 @@ def _open_ended_multi( candidates=selected or candidates, state=state, step=step, + model=model, + critique=critique, ) if state: state.update("done", step=total_steps) @@ -3066,19 +3301,23 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: angle_count = 2 if mode == "fast" else 4 - return 1 + angle_count + 2 + return 2 + angle_count + 2 + (1 if mode == "deep" else 0) def _open_ended_fast( prompt: str, *, fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], history_lines: list[str], state: ThoughtState | None = None, ) -> str: return _open_ended_multi( prompt, fact_pack=fact_pack, + fact_lines=fact_lines, + fact_meta=fact_meta, history_lines=history_lines, mode="fast", state=state, @@ -3089,12 +3328,16 @@ def _open_ended_deep( prompt: str, *, fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], history_lines: list[str], state: ThoughtState | None = None, ) -> str: return _open_ended_multi( prompt, fact_pack=fact_pack, + fact_lines=fact_lines, + fact_meta=fact_meta, history_lines=history_lines, mode="deep", state=state, @@ -3109,31 +3352,61 @@ def open_ended_answer( workloads: list[dict[str, Any]], history_lines: list[str], mode: str, + allow_tools: bool, state: ThoughtState | None = None, ) -> str: lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if _knowledge_intent(prompt) or _doc_intent(prompt): + kb_detail = kb_retrieve(prompt) + if kb_detail: + for line in kb_detail.splitlines(): + if line.strip(): + lines.append(line.strip()) + tool_lines = _tool_fact_lines(prompt, allow_tools=allow_tools) + if tool_lines: + lines.extend(tool_lines) if not lines: return _ensure_scores("I don't have enough data to answer that.") fact_pack = _fact_pack_text(lines) + fact_meta = _fact_pack_meta(lines) if mode == "fast": return _open_ended_fast( prompt, fact_pack=fact_pack, + fact_lines=lines, + fact_meta=fact_meta, history_lines=history_lines, state=state, ) return _open_ended_deep( prompt, fact_pack=fact_pack, + fact_lines=lines, + fact_meta=fact_meta, history_lines=history_lines, state=state, ) -def _non_cluster_reply(prompt: str) -> str: - return _ensure_scores( - "I focus on the Atlas/Othrys cluster and don't have enough data to answer that." +def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> str: + system = ( + "System: You are Atlas, a helpful general assistant. " + "Answer using common knowledge when possible, and say when you're unsure. " + "Be concise and avoid unnecessary caveats. " + "Respond in plain sentences (no lists unless asked). " + "End every response with a line: 'Confidence: high|medium|low'." ) + model = _model_for_mode(mode) + context = _append_history_context("", history_lines) if history_lines else "" + reply = _ollama_call( + ("general", "reply"), + prompt, + context=context, + use_history=False, + system_override=system, + model=model, + ) + return _ensure_scores(reply) # Internal HTTP endpoint for cluster answers (website uses this). @@ -3183,7 +3456,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): return cleaned = _strip_bot_mention(prompt) mode = str(payload.get("mode") or "deep").lower() - if mode not in ("fast", "deep"): + if mode in ("quick", "fast"): + mode = "fast" + elif mode in ("smart", "deep"): + mode = "deep" + else: mode = "deep" snapshot = _snapshot_state() inventory = _snapshot_inventory(snapshot) or node_inventory_live() @@ -3212,37 +3489,19 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): snapshot=snapshot, workloads=workloads, ) - fallback = "I don't have enough data to answer that." if cluster_query: - open_ended = ( - _is_subjective_query(cleaned) - or _knowledge_intent(cleaned) - or _is_overview_query(cleaned) - or _doc_intent(cleaned) + answer = open_ended_answer( + cleaned, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + allow_tools=False, + state=None, ) - if open_ended: - answer = open_ended_answer( - cleaned, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history_lines, - mode=mode, - state=None, - ) - else: - answer = ( - cluster_answer( - cleaned, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history_lines, - ) - or fallback - ) else: - answer = _non_cluster_reply(cleaned) + answer = _non_cluster_reply(cleaned, history_lines=history_lines, mode=mode) self._write_json(200, {"answer": answer}) @@ -3490,6 +3749,7 @@ def _ollama_call( context: str, use_history: bool = True, system_override: str | None = None, + model: str | None = None, ) -> str: system = system_override or ( "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " @@ -3521,7 +3781,8 @@ def _ollama_call( messages.extend(_history_to_messages(history[hist_key][-24:])) messages.append({"role": "user", "content": prompt}) - payload = {"model": MODEL, "messages": messages, "stream": False} + model_name = model or MODEL + payload = {"model": model_name, "messages": messages, "stream": False} headers = {"Content-Type": "application/json"} if API_KEY: headers["x-api-key"] = API_KEY @@ -3561,11 +3822,18 @@ def ollama_reply( context: str, fallback: str = "", use_history: bool = True, + model: str | None = None, ) -> str: last_error = None for attempt in range(max(1, OLLAMA_RETRIES + 1)): try: - return _ollama_call(hist_key, prompt, context=context, use_history=use_history) + return _ollama_call( + hist_key, + prompt, + context=context, + use_history=use_history, + model=model, + ) except Exception as exc: # noqa: BLE001 last_error = exc time.sleep(min(4, 2 ** attempt)) @@ -3584,6 +3852,7 @@ def ollama_reply_with_thinking( context: str, fallback: str, use_history: bool = True, + model: str | None = None, ) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() @@ -3595,6 +3864,7 @@ def ollama_reply_with_thinking( context=context, fallback=fallback, use_history=use_history, + model=model, ) done.set() @@ -3627,6 +3897,7 @@ def open_ended_with_thinking( workloads: list[dict[str, Any]], history_lines: list[str], mode: str, + allow_tools: bool, ) -> str: result: dict[str, str] = {"reply": ""} done = threading.Event() @@ -3641,6 +3912,7 @@ def open_ended_with_thinking( workloads=workloads, history_lines=history_lines, mode=mode, + allow_tools=allow_tools, state=state, ) done.set() @@ -3766,39 +4038,24 @@ def sync_loop(token: str, room_id: str): extra = "VictoriaMetrics (PromQL result):\n" + rendered send_msg(token, rid, extra) continue - fallback = "I don't have enough data to answer that." - if cluster_query: - open_ended = ( - _is_subjective_query(cleaned_body) - or _knowledge_intent(cleaned_body) - or _is_overview_query(cleaned_body) - or _doc_intent(cleaned_body) + reply = open_ended_with_thinking( + token, + rid, + cleaned_body, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", + allow_tools=allow_tools, ) - if open_ended: - reply = open_ended_with_thinking( - token, - rid, - cleaned_body, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history[hist_key], - mode=mode if mode in ("fast", "deep") else "deep", - ) - else: - reply = ( - cluster_answer( - cleaned_body, - inventory=inventory, - snapshot=snapshot, - workloads=workloads, - history_lines=history[hist_key], - ) - or fallback - ) else: - reply = _non_cluster_reply(cleaned_body) + reply = _non_cluster_reply( + cleaned_body, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", + ) send_msg(token, rid, reply) history[hist_key].append(f"Atlas: {reply}") history[hist_key] = history[hist_key][-80:] From 86623b4596016c1832a21eb63ba6366387db13ca Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:49:28 -0300 Subject: [PATCH 385/416] atlasbot: fix insight scoring --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index b08f20db..26699b3c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-82 + checksum/atlasbot-configmap: manual-atlasbot-83 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7e6341e6..dd6ea2ee 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2724,6 +2724,14 @@ def _insight_score( return base +def _score_insight(insight: dict[str, Any], preference: str) -> float: + relevance = _normalize_fraction(insight.get("relevance"), default=0.6) + novelty = _normalize_fraction(insight.get("novelty"), default=0.5) + if preference == "novelty": + return novelty * 0.6 + relevance * 0.4 + return relevance * 0.6 + novelty * 0.4 + + def _select_diverse_insights( candidates: list[dict[str, Any]], *, From a8bebb39346e83a01ee10cdfca8904a7a88529a0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 27 Jan 2026 23:57:36 -0300 Subject: [PATCH 386/416] atlasbot: speed up fast mode --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 57 +++++++++++++++---------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 26699b3c..b9b8ea70 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-83 + checksum/atlasbot-configmap: manual-atlasbot-84 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index dd6ea2ee..91084783 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3201,28 +3201,37 @@ def _open_ended_multi( state: ThoughtState | None = None, ) -> str: model = _model_for_mode(mode) - angle_count = 2 if mode == "fast" else 4 - insight_count = 2 if mode == "fast" else 4 - total_steps = 2 + angle_count + 2 + (1 if mode == "deep" else 0) + if mode == "fast": + angle_count = 1 + insight_count = 1 + total_steps = 2 + else: + angle_count = 4 + insight_count = 4 + total_steps = 2 + angle_count + 2 + 1 if state: state.total_steps = total_steps - angles = _open_ended_plan( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - count=angle_count, - state=state, - model=model, - ) - insights = _open_ended_insights( - prompt, - fact_pack=fact_pack, - fact_meta=fact_meta, - history_lines=history_lines, - count=insight_count, - state=state, - model=model, - ) + + angles: list[dict[str, Any]] = [] + insights: list[dict[str, Any]] = [] + if mode != "fast": + angles = _open_ended_plan( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + count=angle_count, + state=state, + model=model, + ) + insights = _open_ended_insights( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + count=insight_count, + state=state, + model=model, + ) seeds = _seed_insights(fact_lines, fact_meta, limit=max(4, insight_count)) insight_candidates = insights + seeds subjective = _is_subjective_query(prompt) @@ -3261,7 +3270,7 @@ def _open_ended_multi( ) candidates: list[dict[str, Any]] = [] - step = 3 + step = 1 if mode == "fast" else 3 for angle in angle_inputs[:angle_count]: candidates.append( _open_ended_candidate( @@ -3308,8 +3317,10 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: - angle_count = 2 if mode == "fast" else 4 - return 2 + angle_count + 2 + (1 if mode == "deep" else 0) + if mode == "fast": + return 2 + angle_count = 4 + return 2 + angle_count + 2 + 1 def _open_ended_fast( From 683dad9e201d72389f46b3fe58036f121b74cb01 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 00:22:32 -0300 Subject: [PATCH 387/416] atlasbot: improve multi-pass synthesis --- services/comms/scripts/atlasbot/bot.py | 307 +++++++++++++++++++------ 1 file changed, 239 insertions(+), 68 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 91084783..df718e6e 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2559,8 +2559,13 @@ def _fact_pack_lines( return lines -def _fact_pack_text(lines: list[str]) -> str: - labeled = [f"F{idx + 1}: {line}" for idx, line in enumerate(lines)] +def _fact_pack_text(lines: list[str], fact_meta: dict[str, dict[str, Any]]) -> str: + labeled: list[str] = [] + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = fact_meta.get(fid, {}).get("tags") or [] + tag_text = f" [tags: {', '.join(tags)}]" if tags else "" + labeled.append(f"{fid}{tag_text}: {line}") return "Fact pack:\n" + "\n".join(labeled) @@ -2782,7 +2787,8 @@ def _open_ended_system() -> str: "Use ONLY the provided fact pack and recent chat as your evidence. " "You may draw light inferences if you label them as such. " "Write concise, human sentences with a helpful, calm tone (not a list). " - "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. " + "If the question is subjective (cool/interesting/unconventional), pick a standout fact and explain why it stands out. " + "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " "Do not invent numbers or facts. " @@ -2938,6 +2944,67 @@ def _open_ended_plan( return cleaned +def _sanitize_focus_tags(raw_tags: list[Any]) -> list[str]: + tags: list[str] = [] + for tag in raw_tags: + if not isinstance(tag, str): + continue + tag = tag.strip() + if tag in _ALLOWED_INSIGHT_TAGS and tag not in tags: + tags.append(tag) + return tags + + +def _open_ended_interpret( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None, + model: str | None, +) -> dict[str, Any]: + if state: + state.update("interpreting", step=1, note="reading question") + allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) + prompt_text = ( + "Classify how to answer the question using only the fact pack. " + "Return JSON: {\"style\":\"objective|subjective\"," + "\"tone\":\"neutral|curious|enthusiastic\"," + "\"focus_tags\":[\"tag\"]," + "\"focus_label\":\"short phrase\"," + "\"allow_list\":true|false}. " + "Use allow_list=true only if the question explicitly asks for names or lists. " + f"Only use tags from: {allowed_tags}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + if not isinstance(result, dict): + result = {} + style = str(result.get("style") or "").strip().lower() + if style not in ("objective", "subjective"): + style = "subjective" if _is_subjective_query(prompt) else "objective" + tone = str(result.get("tone") or "neutral").strip().lower() + if tone not in ("neutral", "curious", "enthusiastic"): + tone = "neutral" + focus_tags = _sanitize_focus_tags(result.get("focus_tags") or []) + focus_label = str(result.get("focus_label") or "").strip() + allow_list = result.get("allow_list") + if not isinstance(allow_list, bool): + q = normalize_query(prompt) + allow_list = any(phrase in q for phrase in ("list", "which", "what are", "names")) + return { + "style": style, + "tone": tone, + "focus_tags": focus_tags, + "focus_label": focus_label, + "allow_list": allow_list, + } + + def _preferred_tags_for_prompt(prompt: str) -> set[str]: q = normalize_query(prompt) tags: set[str] = set() @@ -3013,6 +3080,71 @@ def _open_ended_insights( return cleaned +def _fallback_fact_ids( + fact_meta: dict[str, dict[str, Any]], + *, + focus_tags: set[str], + count: int, +) -> list[str]: + if not fact_meta: + return [] + if focus_tags: + tagged = [ + fid + for fid, meta in fact_meta.items() + if focus_tags & set(meta.get("tags") or []) + ] + if tagged: + return tagged[:count] + return list(fact_meta.keys())[:count] + + +def _open_ended_select_facts( + prompt: str, + *, + fact_pack: str, + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + focus_tags: set[str], + avoid_fact_ids: list[str], + count: int, + subjective: bool, + state: ThoughtState | None, + step: int, + model: str | None, +) -> list[str]: + if state: + state.update("selecting facts", step=step, note="picking evidence") + focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any" + avoid_hint = ", ".join(avoid_fact_ids) if avoid_fact_ids else "none" + prompt_text = ( + "Select the fact IDs that best answer the question. " + f"Pick up to {count} fact IDs. " + f"Focus tags: {focus_hint}. " + f"Avoid these fact IDs: {avoid_hint}. " + "If the question is subjective, pick standout or unusual facts; " + "if objective, pick the minimal facts needed. " + "Return JSON: {\"fact_ids\":[\"F1\"...],\"note\":\"...\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + fact_ids = result.get("fact_ids") if isinstance(result, dict) else None + selected: list[str] = [] + if isinstance(fact_ids, list): + for fid in fact_ids: + if isinstance(fid, str) and fid in fact_meta and fid not in selected: + selected.append(fid) + if len(selected) >= count: + break + if not selected: + selected = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + return selected + + def _normalize_score(value: Any, *, default: int = 60) -> int: if isinstance(value, (int, float)): return int(max(0, min(100, value))) @@ -3043,6 +3175,9 @@ def _open_ended_candidate( focus: str, fact_pack: str, history_lines: list[str], + subjective: bool, + tone: str, + allow_list: bool, state: ThoughtState | None, step: int, fact_hints: list[str] | None = None, @@ -3053,10 +3188,23 @@ def _open_ended_candidate( hint_text = "" if fact_hints: hint_text = " Prioritize these fact IDs if relevant: " + ", ".join(fact_hints) + "." + style_hint = ( + "Offer a brief opinion grounded in facts and explain why it stands out. " + if subjective + else "Answer directly and succinctly. " + ) + list_hint = ( + "If a list is requested, embed it inline in a sentence (comma-separated). " + if allow_list + else "Avoid bullet lists. " + ) prompt_text = ( "Using ONLY the fact pack, answer the question focusing on this angle: " f"{focus}. " - "Write 2-4 sentences in plain prose (not a list)." + f"Tone: {tone}. " + + style_hint + + list_hint + + "Write 2-4 sentences in plain prose." + hint_text + " " "If you infer, label it as inference. " @@ -3125,6 +3273,9 @@ def _open_ended_synthesize( fact_pack: str, history_lines: list[str], candidates: list[dict[str, Any]], + subjective: bool, + tone: str, + allow_list: bool, state: ThoughtState | None, step: int, model: str | None, @@ -3133,6 +3284,16 @@ def _open_ended_synthesize( if state: state.update("synthesizing", step=step, note="composing answer") critique_block = f"\nCritique guidance: {critique}\n" if critique else "\n" + style_hint = ( + "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. " + if subjective + else "Answer directly without extra caveats. " + ) + list_hint = ( + "If a list is requested, embed it inline in a sentence (comma-separated). " + if allow_list + else "Avoid bullet lists. " + ) synth_prompt = ( "Compose the final answer to the question using the candidate answers below. " "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. " @@ -3140,7 +3301,10 @@ def _open_ended_synthesize( "If you infer, label it as inference. " "Do not claim nodes are missing or not ready unless the fact pack explicitly lists " "nodes_not_ready or expected_workers_missing. " - "Keep the tone conversational and answer the user's intent directly. " + f"Tone: {tone}. " + + style_hint + + list_hint + + "Keep the tone conversational and answer the user's intent directly. " "Avoid repeating the last response if possible. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " "HallucinationRisk (low|medium|high).\n" @@ -3202,85 +3366,90 @@ def _open_ended_multi( ) -> str: model = _model_for_mode(mode) if mode == "fast": - angle_count = 1 - insight_count = 1 - total_steps = 2 + total_steps = 4 else: - angle_count = 4 - insight_count = 4 - total_steps = 2 + angle_count + 2 + 1 + total_steps = 7 if state: state.total_steps = total_steps - angles: list[dict[str, Any]] = [] - insights: list[dict[str, Any]] = [] - if mode != "fast": - angles = _open_ended_plan( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - count=angle_count, - state=state, - model=model, - ) - insights = _open_ended_insights( + interpretation = _open_ended_interpret( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + state=state, + model=model, + ) + style = interpretation.get("style") or "objective" + subjective = style == "subjective" or _is_subjective_query(prompt) + tone = str(interpretation.get("tone") or "").strip().lower() + if tone not in ("neutral", "curious", "enthusiastic"): + tone = "curious" if subjective else "neutral" + allow_list = bool(interpretation.get("allow_list")) + focus_tags = set(interpretation.get("focus_tags") or []) or _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + + primary_ids = _open_ended_select_facts( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + focus_tags=focus_tags, + avoid_fact_ids=[], + count=4 if mode == "deep" else 3, + subjective=subjective, + state=state, + step=2, + model=model, + ) + alternate_ids: list[str] = [] + if mode == "deep": + alternate_ids = _open_ended_select_facts( prompt, fact_pack=fact_pack, fact_meta=fact_meta, history_lines=history_lines, - count=insight_count, + focus_tags=focus_tags, + avoid_fact_ids=primary_ids, + count=4, + subjective=subjective, state=state, + step=3, model=model, ) - seeds = _seed_insights(fact_lines, fact_meta, limit=max(4, insight_count)) - insight_candidates = insights + seeds - subjective = _is_subjective_query(prompt) - prefer_tags = _preferred_tags_for_prompt(prompt) - history_tags = _history_tags(history_lines) - avoid_tags = history_tags if subjective else set() - preference = "novelty" if subjective else "relevance" - selected_insights = _select_diverse_insights( - insight_candidates, - preference=preference, - prefer_tags=prefer_tags, - avoid_tags=avoid_tags, - history_tags=history_tags, - fact_meta=fact_meta, - count=1 if mode == "fast" else 2, - ) - if state and selected_insights: - state.update("analyzing", note=_candidate_note(selected_insights[0])) - - angle_inputs: list[dict[str, Any]] = [] - for insight in selected_insights: - angle_inputs.append( - { - "focus": str(insight.get("summary") or "Direct answer"), - "fact_ids": insight.get("fact_ids") or [], - } - ) - for angle in angles: - if len(angle_inputs) >= angle_count: - break - angle_inputs.append( - { - "focus": str(angle.get("focus") or "Direct answer"), - "fact_ids": [], - } - ) candidates: list[dict[str, Any]] = [] - step = 1 if mode == "fast" else 3 - for angle in angle_inputs[:angle_count]: + focus_label = interpretation.get("focus_label") or "primary angle" + step = 3 if mode == "fast" else 4 + candidates.append( + _open_ended_candidate( + prompt, + focus=str(focus_label), + fact_pack=fact_pack, + history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + fact_hints=primary_ids, + model=model, + ) + ) + step += 1 + if mode == "deep" and alternate_ids: candidates.append( _open_ended_candidate( prompt, - focus=str(angle.get("focus") or "Direct answer"), + focus="alternate angle", fact_pack=fact_pack, history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, state=state, step=step, - fact_hints=angle.get("fact_ids") if isinstance(angle.get("fact_ids"), list) else None, + fact_hints=alternate_ids, model=model, ) ) @@ -3306,6 +3475,9 @@ def _open_ended_multi( fact_pack=fact_pack, history_lines=history_lines, candidates=selected or candidates, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, state=state, step=step, model=model, @@ -3318,9 +3490,8 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 2 - angle_count = 4 - return 2 + angle_count + 2 + 1 + return 4 + return 7 def _open_ended_fast( @@ -3386,8 +3557,8 @@ def open_ended_answer( lines.extend(tool_lines) if not lines: return _ensure_scores("I don't have enough data to answer that.") - fact_pack = _fact_pack_text(lines) fact_meta = _fact_pack_meta(lines) + fact_pack = _fact_pack_text(lines, fact_meta) if mode == "fast": return _open_ended_fast( prompt, From 2f6a64870b5b0009e3d064d514c8d6e6f29ac356 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 00:24:13 -0300 Subject: [PATCH 388/416] atlasbot: roll config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index b9b8ea70..bc6790bd 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-84 + checksum/atlasbot-configmap: manual-atlasbot-85 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 659d07a7aa33597a5491a1a0cdfca65c900c4368 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:02:14 -0300 Subject: [PATCH 389/416] atlasbot: enrich fact pack and selection --- services/comms/scripts/atlasbot/bot.py | 104 +++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 8 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index df718e6e..55c6da2b 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -936,6 +936,28 @@ def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]: per_node.setdefault(node, {})[metric_name] = entry.get("value") return [{"node": node, **vals} for node, vals in sorted(per_node.items())] +def _usage_extremes(usage_table: list[dict[str, Any]]) -> dict[str, tuple[str, float]]: + extremes: dict[str, tuple[str, float]] = {} + for metric in ("cpu", "ram", "net", "io"): + values: list[tuple[str, float]] = [] + for entry in usage_table: + node = entry.get("node") + raw = entry.get(metric) + if not node or raw is None: + continue + try: + value = float(raw) + except (TypeError, ValueError): + continue + values.append((node, value)) + if not values: + continue + lowest = min(values, key=lambda item: item[1]) + highest = max(values, key=lambda item: item[1]) + extremes[f"min_{metric}"] = lowest + extremes[f"max_{metric}"] = highest + return extremes + def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]: cleaned: list[dict[str, Any]] = [] for entry in workloads: @@ -1023,6 +1045,13 @@ def facts_context( lines.append(f"- arch {key}: {', '.join(nodes_list)}") if control_plane_nodes: lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}") + control_plane_by_hw: dict[str, list[str]] = collections.defaultdict(list) + for node in inv: + if node.get("name") in control_plane_nodes: + control_plane_by_hw[node.get("hardware") or "unknown"].append(node["name"]) + parts = [f"{hw}={', '.join(sorted(nodes))}" for hw, nodes in sorted(control_plane_by_hw.items())] + if parts: + lines.append(f"- control_plane_by_hardware: {', '.join(parts)}") if worker_nodes: lines.append(f"- worker_nodes: {', '.join(worker_nodes)}") if ready_workers or not_ready_workers: @@ -1068,6 +1097,22 @@ def facts_context( if value is not None: lines.append(f"- {key}: {value}") + top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else [] + if top_restarts: + items = [] + for entry in top_restarts[:5]: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") or {} + pod = metric.get("pod") or metric.get("name") or "" + ns = metric.get("namespace") or "" + value = entry.get("value") + label = f"{ns}/{pod}".strip("/") + if label and value is not None: + items.append(f"{label}={value}") + if items: + lines.append(f"- top_restarts_1h: {', '.join(items)}") + usage_table = _node_usage_table(metrics) if usage_table: lines.append("- node_usage (cpu/ram/net/io):") @@ -1088,6 +1133,18 @@ def facts_context( else "" ) lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") + extremes = _usage_extremes(usage_table) + for metric in ("cpu", "ram", "net", "io"): + min_key = f"min_{metric}" + if min_key not in extremes: + continue + node, value = extremes[min_key] + value_fmt = _format_metric_value( + str(value), + percent=metric in ("cpu", "ram"), + rate=metric in ("net", "io"), + ) + lines.append(f"- lowest_{metric}: {node} ({value_fmt})") if nodes_in_query: lines.append("- node_details:") @@ -1112,13 +1169,37 @@ def facts_context( wl = entry.get("workload") or "" primary = entry.get("primary_node") or "" pods_total = entry.get("pods_total") + pods_running = entry.get("pods_running") label = f"{ns}/{wl}" if ns and wl else (wl or ns) if not label: continue if primary: - lines.append(f" - {label}: primary_node={primary}, pods_total={pods_total}") + lines.append( + f" - {label}: primary_node={primary}, pods_total={pods_total}, pods_running={pods_running}" + ) else: - lines.append(f" - {label}: pods_total={pods_total}") + lines.append(f" - {label}: pods_total={pods_total}, pods_running={pods_running}") + top = max( + (entry for entry in workload_entries if isinstance(entry.get("pods_total"), (int, float))), + key=lambda item: item.get("pods_total", 0), + default=None, + ) + if isinstance(top, dict) and top.get("pods_total") is not None: + label = f"{top.get('namespace')}/{top.get('workload')}".strip("/") + lines.append(f"- workload_most_pods: {label} ({top.get('pods_total')})") + zero_running = [ + entry + for entry in workload_entries + if isinstance(entry.get("pods_running"), (int, float)) and entry.get("pods_running") == 0 + ] + if zero_running: + labels = [] + for entry in zero_running: + label = f"{entry.get('namespace')}/{entry.get('workload')}".strip("/") + if label: + labels.append(label) + if labels: + lines.append(f"- workloads_zero_running: {', '.join(labels)}") rendered = "\n".join(lines) return rendered[:MAX_FACTS_CHARS] @@ -2609,15 +2690,15 @@ def _fact_line_tags(line: str) -> set[str]: tags.add("architecture") if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")): tags.update({"hardware", "inventory"}) - if "control_plane_nodes" in text or "worker_nodes" in text: + if "control_plane_nodes" in text or "control_plane_by_hardware" in text or "worker_nodes" in text: tags.add("inventory") - if any(key in text for key in ("hottest_", "node_usage", "cpu=", "ram=", "net=", "io=")): + if any(key in text for key in ("hottest_", "lowest_", "node_usage", "cpu=", "ram=", "net=", "io=")): tags.add("utilization") if "postgres_" in text or "postgres connections" in text: tags.add("database") - if "pods_" in text or "pod phases" in text: + if "pods_" in text or "pod phases" in text or "restarts" in text: tags.add("pods") - if "workloads" in text or "primary_node" in text: + if "workloads" in text or "primary_node" in text or "workload_" in text: tags.add("workloads") if "node_details" in text: tags.add("node_detail") @@ -3140,8 +3221,15 @@ def _open_ended_select_facts( selected.append(fid) if len(selected) >= count: break - if not selected: - selected = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + seed = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + if selected: + for fid in seed: + if fid not in selected: + selected.append(fid) + if len(selected) >= count: + break + else: + selected = seed return selected From d151bcde6fa9e532893802ad197127bcf35e1805 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:02:32 -0300 Subject: [PATCH 390/416] atlasbot: roll config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index bc6790bd..7ce144c6 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-85 + checksum/atlasbot-configmap: manual-atlasbot-86 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 62feb26afd1cb7719b299f73773a72f47e9971c7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:07:13 -0300 Subject: [PATCH 391/416] atlasbot: add per-hardware extremes --- services/comms/scripts/atlasbot/bot.py | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 55c6da2b..50fed4b6 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1145,6 +1145,33 @@ def facts_context( rate=metric in ("net", "io"), ) lines.append(f"- lowest_{metric}: {node} ({value_fmt})") + for metric in ("cpu", "ram"): + hottest_parts: list[str] = [] + lowest_parts: list[str] = [] + for hw, nodes_list in sorted(by_hardware.items()): + entries = [] + for entry in usage_table: + node = entry.get("node") + if node in nodes_list and entry.get(metric) is not None: + try: + value = float(entry.get(metric)) + except (TypeError, ValueError): + continue + entries.append((node, value)) + if not entries: + continue + max_node, max_val = max(entries, key=lambda item: item[1]) + min_node, min_val = min(entries, key=lambda item: item[1]) + hottest_parts.append( + f"{hw}={max_node} ({_format_metric_value(str(max_val), percent=True)})" + ) + lowest_parts.append( + f"{hw}={min_node} ({_format_metric_value(str(min_val), percent=True)})" + ) + if hottest_parts: + lines.append(f"- hottest_{metric}_by_hardware: {', '.join(hottest_parts)}") + if lowest_parts: + lines.append(f"- lowest_{metric}_by_hardware: {', '.join(lowest_parts)}") if nodes_in_query: lines.append("- node_details:") From 12f9dbdb4e0e99dd59d37f0846c2915f00072779 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:07:26 -0300 Subject: [PATCH 392/416] comms: roll atlasbot config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7ce144c6..1d89335a 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-86 + checksum/atlasbot-configmap: manual-atlasbot-87 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 845be4efbccd7a5cea1286525ae5c20ee335f2d5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:35:34 -0300 Subject: [PATCH 393/416] maintenance: suspend ariadne migrate job --- services/maintenance/ariadne-migrate-job.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml index b9b1496f..367a1a09 100644 --- a/services/maintenance/ariadne-migrate-job.yaml +++ b/services/maintenance/ariadne-migrate-job.yaml @@ -7,6 +7,7 @@ metadata: annotations: kustomize.toolkit.fluxcd.io/force: "true" spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: @@ -15,6 +16,7 @@ spec: app: ariadne-migrate annotations: vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" vault.hashicorp.com/role: "maintenance" vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | From bc5927020285ef319d2d563c2787bb7b0b1ded01 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:48:32 -0300 Subject: [PATCH 394/416] chore: organize one-off jobs --- .../kustomization.yaml | 2 +- services/bstein-dev-home/kustomization.yaml | 2 +- .../migrations/kustomization.yaml | 2 +- .../migrations/portal-migrate-job.yaml | 7 ++++- .../portal-onboarding-e2e-test-job.yaml | 7 ++++- services/comms/kustomization.yaml | 20 ++++++------- .../{ => oneoffs}/bstein-force-leave-job.yaml | 7 ++++- .../comms-secrets-ensure-job.yaml | 7 ++++- .../mas-admin-client-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/mas-db-ensure-job.yaml | 7 ++++- .../mas-local-users-ensure-job.yaml | 7 ++++- .../othrys-kick-numeric-job.yaml | 7 ++++- .../synapse-admin-ensure-job.yaml | 7 ++++- .../synapse-seeder-admin-ensure-job.yaml | 7 ++++- .../synapse-signingkey-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/synapse-user-seed-job.yaml | 7 ++++- services/finance/kustomization.yaml | 2 +- .../finance-secrets-ensure-job.yaml | 7 ++++- services/keycloak/kustomization.yaml | 30 +++++++++---------- .../actual-oidc-secret-ensure-job.yaml | 7 ++++- .../harbor-oidc-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/ldap-federation-job.yaml | 7 ++++- .../logs-oidc-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/mas-secrets-ensure-job.yaml | 7 ++++- ...portal-admin-client-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/portal-e2e-client-job.yaml | 7 ++++- ...al-e2e-execute-actions-email-test-job.yaml | 7 ++++- .../portal-e2e-target-client-job.yaml | 7 ++++- ...al-e2e-token-exchange-permissions-job.yaml | 7 ++++- .../portal-e2e-token-exchange-test-job.yaml | 7 ++++- .../{ => oneoffs}/realm-settings-job.yaml | 7 ++++- .../synapse-oidc-secret-ensure-job.yaml | 7 ++++- .../{ => oneoffs}/user-overrides-job.yaml | 7 ++++- .../vault-oidc-secret-ensure-job.yaml | 7 ++++- services/logging/kustomization.yaml | 6 ++-- .../opensearch-dashboards-setup-job.yaml | 7 ++++- .../{ => oneoffs}/opensearch-ism-job.yaml | 7 ++++- .../opensearch-observability-setup-job.yaml | 7 ++++- services/mailu/kustomization.yaml | 2 +- .../mailu/{ => oneoffs}/mailu-sync-job.yaml | 7 ++++- services/maintenance/kustomization.yaml | 4 +-- .../{ => oneoffs}/ariadne-migrate-job.yaml | 6 +++- .../k3s-traefik-cleanup-job.yaml | 7 ++++- services/monitoring/kustomization.yaml | 4 +-- .../{ => oneoffs}/grafana-org-bootstrap.yaml | 7 ++++- .../grafana-user-dedupe-job.yaml | 7 ++++- 46 files changed, 252 insertions(+), 73 deletions(-) rename services/bstein-dev-home/{ => oneoffs}/migrations/kustomization.yaml (66%) rename services/bstein-dev-home/{ => oneoffs}/migrations/portal-migrate-job.yaml (78%) rename services/bstein-dev-home/{ => oneoffs}/portal-onboarding-e2e-test-job.yaml (89%) rename services/comms/{ => oneoffs}/bstein-force-leave-job.yaml (96%) rename services/comms/{ => oneoffs}/comms-secrets-ensure-job.yaml (92%) rename services/comms/{ => oneoffs}/mas-admin-client-secret-ensure-job.yaml (90%) rename services/comms/{ => oneoffs}/mas-db-ensure-job.yaml (91%) rename services/comms/{ => oneoffs}/mas-local-users-ensure-job.yaml (97%) rename services/comms/{ => oneoffs}/othrys-kick-numeric-job.yaml (96%) rename services/comms/{ => oneoffs}/synapse-admin-ensure-job.yaml (96%) rename services/comms/{ => oneoffs}/synapse-seeder-admin-ensure-job.yaml (93%) rename services/comms/{ => oneoffs}/synapse-signingkey-ensure-job.yaml (88%) rename services/comms/{ => oneoffs}/synapse-user-seed-job.yaml (96%) rename services/finance/{ => oneoffs}/finance-secrets-ensure-job.yaml (83%) rename services/keycloak/{ => oneoffs}/actual-oidc-secret-ensure-job.yaml (83%) rename services/keycloak/{ => oneoffs}/harbor-oidc-secret-ensure-job.yaml (83%) rename services/keycloak/{ => oneoffs}/ldap-federation-job.yaml (98%) rename services/keycloak/{ => oneoffs}/logs-oidc-secret-ensure-job.yaml (94%) rename services/keycloak/{ => oneoffs}/mas-secrets-ensure-job.yaml (95%) rename services/keycloak/{ => oneoffs}/portal-admin-client-secret-ensure-job.yaml (96%) rename services/keycloak/{ => oneoffs}/portal-e2e-client-job.yaml (97%) rename services/keycloak/{ => oneoffs}/portal-e2e-execute-actions-email-test-job.yaml (89%) rename services/keycloak/{ => oneoffs}/portal-e2e-target-client-job.yaml (95%) rename services/keycloak/{ => oneoffs}/portal-e2e-token-exchange-permissions-job.yaml (97%) rename services/keycloak/{ => oneoffs}/portal-e2e-token-exchange-test-job.yaml (89%) rename services/keycloak/{ => oneoffs}/realm-settings-job.yaml (98%) rename services/keycloak/{ => oneoffs}/synapse-oidc-secret-ensure-job.yaml (92%) rename services/keycloak/{ => oneoffs}/user-overrides-job.yaml (96%) rename services/keycloak/{ => oneoffs}/vault-oidc-secret-ensure-job.yaml (83%) rename services/logging/{ => oneoffs}/opensearch-dashboards-setup-job.yaml (88%) rename services/logging/{ => oneoffs}/opensearch-ism-job.yaml (91%) rename services/logging/{ => oneoffs}/opensearch-observability-setup-job.yaml (76%) rename services/mailu/{ => oneoffs}/mailu-sync-job.yaml (93%) rename services/maintenance/{ => oneoffs}/ariadne-migrate-job.yaml (82%) rename services/maintenance/{ => oneoffs}/k3s-traefik-cleanup-job.yaml (77%) rename services/monitoring/{ => oneoffs}/grafana-org-bootstrap.yaml (93%) rename services/monitoring/{ => oneoffs}/grafana-user-dedupe-job.yaml (94%) diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml index da61b2d1..ff97f73b 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml @@ -6,7 +6,7 @@ metadata: namespace: flux-system spec: interval: 10m - path: ./services/bstein-dev-home/migrations + path: ./services/bstein-dev-home/oneoffs/migrations prune: true force: true sourceRef: diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index a8132417..f62fb171 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -16,7 +16,7 @@ resources: - backend-deployment.yaml - backend-service.yaml - vaultwarden-cred-sync-cronjob.yaml - - portal-onboarding-e2e-test-job.yaml + - oneoffs/portal-onboarding-e2e-test-job.yaml - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend diff --git a/services/bstein-dev-home/migrations/kustomization.yaml b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml similarity index 66% rename from services/bstein-dev-home/migrations/kustomization.yaml rename to services/bstein-dev-home/oneoffs/migrations/kustomization.yaml index 067665bc..1d1dfc82 100644 --- a/services/bstein-dev-home/migrations/kustomization.yaml +++ b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml @@ -1,4 +1,4 @@ -# services/bstein-dev-home/migrations/kustomization.yaml +# services/bstein-dev-home/oneoffs/migrations/kustomization.yaml apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: bstein-dev-home diff --git a/services/bstein-dev-home/migrations/portal-migrate-job.yaml b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml similarity index 78% rename from services/bstein-dev-home/migrations/portal-migrate-job.yaml rename to services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml index 9d052546..1f7e092b 100644 --- a/services/bstein-dev-home/migrations/portal-migrate-job.yaml +++ b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml @@ -1,4 +1,8 @@ -# services/bstein-dev-home/migrations/portal-migrate-job.yaml +# services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml +# One-off job for bstein-dev-home/bstein-dev-home-portal-migrate-36. +# Purpose: bstein dev home portal migrate 36 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: @@ -7,6 +11,7 @@ metadata: annotations: kustomize.toolkit.fluxcd.io/force: "true" spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml similarity index 89% rename from services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml rename to services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml index 681e89d2..9923499b 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml @@ -1,10 +1,15 @@ -# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +# services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml +# One-off job for bstein-dev-home/portal-onboarding-e2e-test-27. +# Purpose: portal onboarding e2e test 27 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: portal-onboarding-e2e-test-27 namespace: bstein-dev-home spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 01d7be5c..969ca586 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -22,24 +22,24 @@ resources: - mas-db-ensure-rbac.yaml - synapse-signingkey-ensure-rbac.yaml - vault-sync-deployment.yaml - - mas-admin-client-secret-ensure-job.yaml - - mas-db-ensure-job.yaml - - comms-secrets-ensure-job.yaml - - synapse-admin-ensure-job.yaml - - synapse-signingkey-ensure-job.yaml - - synapse-seeder-admin-ensure-job.yaml - - synapse-user-seed-job.yaml - - mas-local-users-ensure-job.yaml + - oneoffs/mas-admin-client-secret-ensure-job.yaml + - oneoffs/mas-db-ensure-job.yaml + - oneoffs/comms-secrets-ensure-job.yaml + - oneoffs/synapse-admin-ensure-job.yaml + - oneoffs/synapse-signingkey-ensure-job.yaml + - oneoffs/synapse-seeder-admin-ensure-job.yaml + - oneoffs/synapse-user-seed-job.yaml + - oneoffs/mas-local-users-ensure-job.yaml - mas-deployment.yaml - livekit-token-deployment.yaml - livekit.yaml - coturn.yaml - seed-othrys-room.yaml - guest-name-job.yaml - - othrys-kick-numeric-job.yaml + - oneoffs/othrys-kick-numeric-job.yaml - pin-othrys-job.yaml - reset-othrys-room-job.yaml - - bstein-force-leave-job.yaml + - oneoffs/bstein-force-leave-job.yaml - livekit-ingress.yaml - livekit-middlewares.yaml - matrix-ingress.yaml diff --git a/services/comms/bstein-force-leave-job.yaml b/services/comms/oneoffs/bstein-force-leave-job.yaml similarity index 96% rename from services/comms/bstein-force-leave-job.yaml rename to services/comms/oneoffs/bstein-force-leave-job.yaml index 0286f8c8..7efe826e 100644 --- a/services/comms/bstein-force-leave-job.yaml +++ b/services/comms/oneoffs/bstein-force-leave-job.yaml @@ -1,10 +1,15 @@ -# services/comms/bstein-force-leave-job.yaml +# services/comms/oneoffs/bstein-force-leave-job.yaml +# One-off job for comms/bstein-leave-rooms-12. +# Purpose: bstein leave rooms 12 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: bstein-leave-rooms-12 namespace: comms spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/comms-secrets-ensure-job.yaml b/services/comms/oneoffs/comms-secrets-ensure-job.yaml similarity index 92% rename from services/comms/comms-secrets-ensure-job.yaml rename to services/comms/oneoffs/comms-secrets-ensure-job.yaml index 52904cc9..35ca73c5 100644 --- a/services/comms/comms-secrets-ensure-job.yaml +++ b/services/comms/oneoffs/comms-secrets-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/comms-secrets-ensure-job.yaml +# services/comms/oneoffs/comms-secrets-ensure-job.yaml +# One-off job for comms/comms-secrets-ensure-7. +# Purpose: comms secrets ensure 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: comms-secrets-ensure-7 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/mas-admin-client-secret-ensure-job.yaml b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml similarity index 90% rename from services/comms/mas-admin-client-secret-ensure-job.yaml rename to services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml index 7b05ccae..e1d54589 100644 --- a/services/comms/mas-admin-client-secret-ensure-job.yaml +++ b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml @@ -1,4 +1,8 @@ -# services/comms/mas-admin-client-secret-ensure-job.yaml +# services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml +# One-off job for comms/mas-admin-client-secret-writer. +# Purpose: mas admin client secret writer (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: v1 kind: ServiceAccount metadata: @@ -41,6 +45,7 @@ metadata: name: mas-admin-client-secret-ensure-11 namespace: comms spec: + suspend: true backoffLimit: 2 template: spec: diff --git a/services/comms/mas-db-ensure-job.yaml b/services/comms/oneoffs/mas-db-ensure-job.yaml similarity index 91% rename from services/comms/mas-db-ensure-job.yaml rename to services/comms/oneoffs/mas-db-ensure-job.yaml index 56707a9b..44137da8 100644 --- a/services/comms/mas-db-ensure-job.yaml +++ b/services/comms/oneoffs/mas-db-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/mas-db-ensure-job.yaml +# services/comms/oneoffs/mas-db-ensure-job.yaml +# One-off job for comms/mas-db-ensure-22. +# Purpose: mas db ensure 22 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: mas-db-ensure-22 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 600 template: diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/oneoffs/mas-local-users-ensure-job.yaml similarity index 97% rename from services/comms/mas-local-users-ensure-job.yaml rename to services/comms/oneoffs/mas-local-users-ensure-job.yaml index 636ee5bb..7b510727 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/oneoffs/mas-local-users-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/mas-local-users-ensure-job.yaml +# services/comms/oneoffs/mas-local-users-ensure-job.yaml +# One-off job for comms/mas-local-users-ensure-18. +# Purpose: mas local users ensure 18 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: mas-local-users-ensure-18 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/othrys-kick-numeric-job.yaml b/services/comms/oneoffs/othrys-kick-numeric-job.yaml similarity index 96% rename from services/comms/othrys-kick-numeric-job.yaml rename to services/comms/oneoffs/othrys-kick-numeric-job.yaml index 0d3914a5..e38a6bb6 100644 --- a/services/comms/othrys-kick-numeric-job.yaml +++ b/services/comms/oneoffs/othrys-kick-numeric-job.yaml @@ -1,10 +1,15 @@ -# services/comms/othrys-kick-numeric-job.yaml +# services/comms/oneoffs/othrys-kick-numeric-job.yaml +# One-off job for comms/othrys-kick-numeric-8. +# Purpose: othrys kick numeric 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: othrys-kick-numeric-8 namespace: comms spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-admin-ensure-job.yaml similarity index 96% rename from services/comms/synapse-admin-ensure-job.yaml rename to services/comms/oneoffs/synapse-admin-ensure-job.yaml index 5ddf60c4..95bc9f2a 100644 --- a/services/comms/synapse-admin-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-admin-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-admin-ensure-job.yaml +# services/comms/oneoffs/synapse-admin-ensure-job.yaml +# One-off job for comms/synapse-admin-ensure-3. +# Purpose: synapse admin ensure 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: synapse-admin-ensure-3 namespace: comms spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml similarity index 93% rename from services/comms/synapse-seeder-admin-ensure-job.yaml rename to services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml index 5d2d4225..1d8972e8 100644 --- a/services/comms/synapse-seeder-admin-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-seeder-admin-ensure-job.yaml +# services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml +# One-off job for comms/synapse-seeder-admin-ensure-9. +# Purpose: synapse seeder admin ensure 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: synapse-seeder-admin-ensure-9 namespace: comms spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/comms/synapse-signingkey-ensure-job.yaml b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml similarity index 88% rename from services/comms/synapse-signingkey-ensure-job.yaml rename to services/comms/oneoffs/synapse-signingkey-ensure-job.yaml index 402a820a..bbc4595b 100644 --- a/services/comms/synapse-signingkey-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-signingkey-ensure-job.yaml +# services/comms/oneoffs/synapse-signingkey-ensure-job.yaml +# One-off job for comms/othrys-synapse-signingkey-ensure-7. +# Purpose: othrys synapse signingkey ensure 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: othrys-synapse-signingkey-ensure-7 namespace: comms spec: + suspend: true backoffLimit: 2 template: spec: diff --git a/services/comms/synapse-user-seed-job.yaml b/services/comms/oneoffs/synapse-user-seed-job.yaml similarity index 96% rename from services/comms/synapse-user-seed-job.yaml rename to services/comms/oneoffs/synapse-user-seed-job.yaml index aab88c3b..a732739a 100644 --- a/services/comms/synapse-user-seed-job.yaml +++ b/services/comms/oneoffs/synapse-user-seed-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-user-seed-job.yaml +# services/comms/oneoffs/synapse-user-seed-job.yaml +# One-off job for comms/synapse-user-seed-8. +# Purpose: synapse user seed 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: synapse-user-seed-8 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/finance/kustomization.yaml b/services/finance/kustomization.yaml index e4c414f5..1559f5c8 100644 --- a/services/finance/kustomization.yaml +++ b/services/finance/kustomization.yaml @@ -9,7 +9,7 @@ resources: - finance-secrets-ensure-rbac.yaml - actual-budget-data-pvc.yaml - firefly-storage-pvc.yaml - - finance-secrets-ensure-job.yaml + - oneoffs/finance-secrets-ensure-job.yaml - actual-budget-deployment.yaml - firefly-deployment.yaml - firefly-user-sync-cronjob.yaml diff --git a/services/finance/finance-secrets-ensure-job.yaml b/services/finance/oneoffs/finance-secrets-ensure-job.yaml similarity index 83% rename from services/finance/finance-secrets-ensure-job.yaml rename to services/finance/oneoffs/finance-secrets-ensure-job.yaml index 67f06cb5..e8c8f588 100644 --- a/services/finance/finance-secrets-ensure-job.yaml +++ b/services/finance/oneoffs/finance-secrets-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/finance/finance-secrets-ensure-job.yaml +# services/finance/oneoffs/finance-secrets-ensure-job.yaml +# One-off job for finance/finance-secrets-ensure-5. +# Purpose: finance secrets ensure 5 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: finance-secrets-ensure-5 namespace: finance spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/kustomization.yaml b/services/keycloak/kustomization.yaml index 6030a821..60278910 100644 --- a/services/keycloak/kustomization.yaml +++ b/services/keycloak/kustomization.yaml @@ -10,21 +10,21 @@ resources: - secretproviderclass.yaml - vault-sync-deployment.yaml - deployment.yaml - - realm-settings-job.yaml - - portal-admin-client-secret-ensure-job.yaml - - portal-e2e-client-job.yaml - - portal-e2e-target-client-job.yaml - - portal-e2e-token-exchange-permissions-job.yaml - - portal-e2e-token-exchange-test-job.yaml - - portal-e2e-execute-actions-email-test-job.yaml - - ldap-federation-job.yaml - - user-overrides-job.yaml - - mas-secrets-ensure-job.yaml - - synapse-oidc-secret-ensure-job.yaml - - logs-oidc-secret-ensure-job.yaml - - harbor-oidc-secret-ensure-job.yaml - - vault-oidc-secret-ensure-job.yaml - - actual-oidc-secret-ensure-job.yaml + - oneoffs/realm-settings-job.yaml + - oneoffs/portal-admin-client-secret-ensure-job.yaml + - oneoffs/portal-e2e-client-job.yaml + - oneoffs/portal-e2e-target-client-job.yaml + - oneoffs/portal-e2e-token-exchange-permissions-job.yaml + - oneoffs/portal-e2e-token-exchange-test-job.yaml + - oneoffs/portal-e2e-execute-actions-email-test-job.yaml + - oneoffs/ldap-federation-job.yaml + - oneoffs/user-overrides-job.yaml + - oneoffs/mas-secrets-ensure-job.yaml + - oneoffs/synapse-oidc-secret-ensure-job.yaml + - oneoffs/logs-oidc-secret-ensure-job.yaml + - oneoffs/harbor-oidc-secret-ensure-job.yaml + - oneoffs/vault-oidc-secret-ensure-job.yaml + - oneoffs/actual-oidc-secret-ensure-job.yaml - service.yaml - ingress.yaml generatorOptions: diff --git a/services/keycloak/actual-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/actual-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml index 3dadb520..d4da1f1f 100644 --- a/services/keycloak/actual-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/actual-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml +# One-off job for sso/actual-oidc-secret-ensure-3. +# Purpose: actual oidc secret ensure 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: actual-oidc-secret-ensure-3 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/harbor-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/harbor-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml index 87de4632..c368241b 100644 --- a/services/keycloak/harbor-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/harbor-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml +# One-off job for sso/harbor-oidc-secret-ensure-10. +# Purpose: harbor oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: harbor-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/ldap-federation-job.yaml b/services/keycloak/oneoffs/ldap-federation-job.yaml similarity index 98% rename from services/keycloak/ldap-federation-job.yaml rename to services/keycloak/oneoffs/ldap-federation-job.yaml index 3c3f1c19..9e9a5f9b 100644 --- a/services/keycloak/ldap-federation-job.yaml +++ b/services/keycloak/oneoffs/ldap-federation-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/ldap-federation-job.yaml +# services/keycloak/oneoffs/ldap-federation-job.yaml +# One-off job for sso/keycloak-ldap-federation-12. +# Purpose: keycloak ldap federation 12 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-ldap-federation-12 namespace: sso spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/keycloak/logs-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml similarity index 94% rename from services/keycloak/logs-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml index 14e80df5..bce9e5b4 100644 --- a/services/keycloak/logs-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/logs-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml +# One-off job for sso/logs-oidc-secret-ensure-10. +# Purpose: logs oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: logs-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/mas-secrets-ensure-job.yaml b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml similarity index 95% rename from services/keycloak/mas-secrets-ensure-job.yaml rename to services/keycloak/oneoffs/mas-secrets-ensure-job.yaml index 24c9e048..c3bd1be0 100644 --- a/services/keycloak/mas-secrets-ensure-job.yaml +++ b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml @@ -1,4 +1,8 @@ -# services/keycloak/mas-secrets-ensure-job.yaml +# services/keycloak/oneoffs/mas-secrets-ensure-job.yaml +# One-off job for sso/mas-secrets-ensure. +# Purpose: mas secrets ensure (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: v1 kind: ServiceAccount metadata: @@ -13,6 +17,7 @@ metadata: name: mas-secrets-ensure-21 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/portal-admin-client-secret-ensure-job.yaml b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml similarity index 96% rename from services/keycloak/portal-admin-client-secret-ensure-job.yaml rename to services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml index 90dd4b71..1d3e7f37 100644 --- a/services/keycloak/portal-admin-client-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-admin-client-secret-ensure-job.yaml +# services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml +# One-off job for sso/keycloak-portal-admin-secret-ensure-4. +# Purpose: keycloak portal admin secret ensure 4 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-admin-secret-ensure-4 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-client-job.yaml similarity index 97% rename from services/keycloak/portal-e2e-client-job.yaml rename to services/keycloak/oneoffs/portal-e2e-client-job.yaml index 4e0c0062..274dd27b 100644 --- a/services/keycloak/portal-e2e-client-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-client-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-client-job.yaml +# services/keycloak/oneoffs/portal-e2e-client-job.yaml +# One-off job for sso/keycloak-portal-e2e-client-8. +# Purpose: keycloak portal e2e client 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-client-8 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml similarity index 89% rename from services/keycloak/portal-e2e-execute-actions-email-test-job.yaml rename to services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml index 35f79a6b..518d839c 100644 --- a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-execute-actions-email-test-job.yaml +# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml +# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14. +# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-execute-actions-email-14 namespace: sso spec: + suspend: true backoffLimit: 3 template: metadata: diff --git a/services/keycloak/portal-e2e-target-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml similarity index 95% rename from services/keycloak/portal-e2e-target-client-job.yaml rename to services/keycloak/oneoffs/portal-e2e-target-client-job.yaml index 196b48bd..900d0290 100644 --- a/services/keycloak/portal-e2e-target-client-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-target-client-job.yaml +# services/keycloak/oneoffs/portal-e2e-target-client-job.yaml +# One-off job for sso/keycloak-portal-e2e-target-7. +# Purpose: keycloak portal e2e target 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-target-7 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml similarity index 97% rename from services/keycloak/portal-e2e-token-exchange-permissions-job.yaml rename to services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml index 647b8f9b..0d41b476 100644 --- a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-token-exchange-permissions-job.yaml +# services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml +# One-off job for sso/keycloak-portal-e2e-token-exchange-permissions-11. +# Purpose: keycloak portal e2e token exchange permissions 11 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-token-exchange-permissions-11 namespace: sso spec: + suspend: true backoffLimit: 6 template: metadata: diff --git a/services/keycloak/portal-e2e-token-exchange-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml similarity index 89% rename from services/keycloak/portal-e2e-token-exchange-test-job.yaml rename to services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml index edd7555e..eb05e09c 100644 --- a/services/keycloak/portal-e2e-token-exchange-test-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-token-exchange-test-job.yaml +# services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml +# One-off job for sso/keycloak-portal-e2e-token-exchange-test-7. +# Purpose: keycloak portal e2e token exchange test 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-token-exchange-test-7 namespace: sso spec: + suspend: true backoffLimit: 6 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/oneoffs/realm-settings-job.yaml similarity index 98% rename from services/keycloak/realm-settings-job.yaml rename to services/keycloak/oneoffs/realm-settings-job.yaml index 9265ca3e..ea88d83f 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/oneoffs/realm-settings-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/realm-settings-job.yaml +# services/keycloak/oneoffs/realm-settings-job.yaml +# One-off job for sso/keycloak-realm-settings-36. +# Purpose: keycloak realm settings 36 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-realm-settings-36 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/synapse-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml similarity index 92% rename from services/keycloak/synapse-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml index e808e7e0..15b7a312 100644 --- a/services/keycloak/synapse-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/synapse-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml +# One-off job for sso/synapse-oidc-secret-ensure-10. +# Purpose: synapse oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: synapse-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/user-overrides-job.yaml b/services/keycloak/oneoffs/user-overrides-job.yaml similarity index 96% rename from services/keycloak/user-overrides-job.yaml rename to services/keycloak/oneoffs/user-overrides-job.yaml index 7623c843..0d52d6d3 100644 --- a/services/keycloak/user-overrides-job.yaml +++ b/services/keycloak/oneoffs/user-overrides-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/user-overrides-job.yaml +# services/keycloak/oneoffs/user-overrides-job.yaml +# One-off job for sso/keycloak-user-overrides-9. +# Purpose: keycloak user overrides 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-user-overrides-9 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/vault-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/vault-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml index 3aa3ca55..a76c52e9 100644 --- a/services/keycloak/vault-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/vault-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml +# One-off job for sso/vault-oidc-secret-ensure-8. +# Purpose: vault oidc secret ensure 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: vault-oidc-secret-ensure-8 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/kustomization.yaml b/services/logging/kustomization.yaml index 08c73a8d..dc487155 100644 --- a/services/logging/kustomization.yaml +++ b/services/logging/kustomization.yaml @@ -15,9 +15,9 @@ resources: - opensearch-dashboards-helmrelease.yaml - data-prepper-helmrelease.yaml - otel-collector-helmrelease.yaml - - opensearch-ism-job.yaml - - opensearch-dashboards-setup-job.yaml - - opensearch-observability-setup-job.yaml + - oneoffs/opensearch-ism-job.yaml + - oneoffs/opensearch-dashboards-setup-job.yaml + - oneoffs/opensearch-observability-setup-job.yaml - opensearch-prune-cronjob.yaml - fluent-bit-helmrelease.yaml - node-log-rotation-daemonset.yaml diff --git a/services/logging/opensearch-dashboards-setup-job.yaml b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml similarity index 88% rename from services/logging/opensearch-dashboards-setup-job.yaml rename to services/logging/oneoffs/opensearch-dashboards-setup-job.yaml index 06149d79..1d1a9b68 100644 --- a/services/logging/opensearch-dashboards-setup-job.yaml +++ b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-dashboards-setup-job.yaml +# services/logging/oneoffs/opensearch-dashboards-setup-job.yaml +# One-off job for logging/opensearch-dashboards-setup-4. +# Purpose: opensearch dashboards setup 4 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: opensearch-dashboards-setup-4 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-ism-job.yaml b/services/logging/oneoffs/opensearch-ism-job.yaml similarity index 91% rename from services/logging/opensearch-ism-job.yaml rename to services/logging/oneoffs/opensearch-ism-job.yaml index 3313571b..476bca7a 100644 --- a/services/logging/opensearch-ism-job.yaml +++ b/services/logging/oneoffs/opensearch-ism-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-ism-job.yaml +# services/logging/oneoffs/opensearch-ism-job.yaml +# One-off job for logging/opensearch-ism-setup-5. +# Purpose: opensearch ism setup 5 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: opensearch-ism-setup-5 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-observability-setup-job.yaml b/services/logging/oneoffs/opensearch-observability-setup-job.yaml similarity index 76% rename from services/logging/opensearch-observability-setup-job.yaml rename to services/logging/oneoffs/opensearch-observability-setup-job.yaml index e4590fb5..6caa0765 100644 --- a/services/logging/opensearch-observability-setup-job.yaml +++ b/services/logging/oneoffs/opensearch-observability-setup-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-observability-setup-job.yaml +# services/logging/oneoffs/opensearch-observability-setup-job.yaml +# One-off job for logging/opensearch-observability-setup-2. +# Purpose: opensearch observability setup 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: opensearch-observability-setup-2 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/mailu/kustomization.yaml b/services/mailu/kustomization.yaml index 7447f24a..3e0494ee 100644 --- a/services/mailu/kustomization.yaml +++ b/services/mailu/kustomization.yaml @@ -13,7 +13,7 @@ resources: - unbound-configmap.yaml - serverstransport.yaml - ingressroute.yaml - - mailu-sync-job.yaml + - oneoffs/mailu-sync-job.yaml - mailu-sync-cronjob.yaml - front-lb.yaml diff --git a/services/mailu/mailu-sync-job.yaml b/services/mailu/oneoffs/mailu-sync-job.yaml similarity index 93% rename from services/mailu/mailu-sync-job.yaml rename to services/mailu/oneoffs/mailu-sync-job.yaml index 8589e9ee..38648acc 100644 --- a/services/mailu/mailu-sync-job.yaml +++ b/services/mailu/oneoffs/mailu-sync-job.yaml @@ -1,10 +1,15 @@ -# services/mailu/mailu-sync-job.yaml +# services/mailu/oneoffs/mailu-sync-job.yaml +# One-off job for mailu-mailserver/mailu-sync-9. +# Purpose: mailu sync 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: mailu-sync-9 namespace: mailu-mailserver spec: + suspend: true template: metadata: annotations: diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index a1ca5831..19b2ba98 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -14,10 +14,10 @@ resources: - node-nofile-serviceaccount.yaml - pod-cleaner-rbac.yaml - ariadne-deployment.yaml - - ariadne-migrate-job.yaml + - oneoffs/ariadne-migrate-job.yaml - ariadne-service.yaml - disable-k3s-traefik-daemonset.yaml - - k3s-traefik-cleanup-job.yaml + - oneoffs/k3s-traefik-cleanup-job.yaml - node-nofile-daemonset.yaml - k3s-agent-restart-daemonset.yaml - pod-cleaner-cronjob.yaml diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/oneoffs/ariadne-migrate-job.yaml similarity index 82% rename from services/maintenance/ariadne-migrate-job.yaml rename to services/maintenance/oneoffs/ariadne-migrate-job.yaml index 367a1a09..ecac68d4 100644 --- a/services/maintenance/ariadne-migrate-job.yaml +++ b/services/maintenance/oneoffs/ariadne-migrate-job.yaml @@ -1,4 +1,8 @@ -# services/maintenance/ariadne-migrate-job.yaml +# services/maintenance/oneoffs/ariadne-migrate-job.yaml +# One-off job for maintenance/ariadne-migrate-2. +# Purpose: ariadne migrate 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: diff --git a/services/maintenance/k3s-traefik-cleanup-job.yaml b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml similarity index 77% rename from services/maintenance/k3s-traefik-cleanup-job.yaml rename to services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml index d5d12a65..2c365a95 100644 --- a/services/maintenance/k3s-traefik-cleanup-job.yaml +++ b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml @@ -1,10 +1,15 @@ -# services/maintenance/k3s-traefik-cleanup-job.yaml +# services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml +# One-off job for maintenance/k3s-traefik-cleanup-2. +# Purpose: k3s traefik cleanup 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: k3s-traefik-cleanup-2 namespace: maintenance spec: + suspend: true backoffLimit: 1 template: spec: diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 59530390..23c1595a 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -23,8 +23,8 @@ resources: - grafana-alerting-config.yaml - grafana-folders.yaml - helmrelease.yaml - - grafana-org-bootstrap.yaml - - grafana-user-dedupe-job.yaml + - oneoffs/grafana-org-bootstrap.yaml + - oneoffs/grafana-user-dedupe-job.yaml configMapGenerator: - name: postmark-exporter-script diff --git a/services/monitoring/grafana-org-bootstrap.yaml b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml similarity index 93% rename from services/monitoring/grafana-org-bootstrap.yaml rename to services/monitoring/oneoffs/grafana-org-bootstrap.yaml index f1d40755..6f824cc5 100644 --- a/services/monitoring/grafana-org-bootstrap.yaml +++ b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml @@ -1,10 +1,15 @@ -# services/monitoring/grafana-org-bootstrap.yaml +# services/monitoring/oneoffs/grafana-org-bootstrap.yaml +# One-off job for monitoring/grafana-org-bootstrap-3. +# Purpose: grafana org bootstrap 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: grafana-org-bootstrap-3 namespace: monitoring spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml similarity index 94% rename from services/monitoring/grafana-user-dedupe-job.yaml rename to services/monitoring/oneoffs/grafana-user-dedupe-job.yaml index 8ab1a665..8194f186 100644 --- a/services/monitoring/grafana-user-dedupe-job.yaml +++ b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml @@ -1,10 +1,15 @@ -# services/monitoring/grafana-user-dedupe-job.yaml +# services/monitoring/oneoffs/grafana-user-dedupe-job.yaml +# One-off job for monitoring/grafana-user-dedupe-api-v7. +# Purpose: grafana user dedupe api v7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: grafana-user-dedupe-api-v7 namespace: monitoring spec: + suspend: true backoffLimit: 1 template: metadata: From c8a9761ed3e9f6af9cb93cb5cf43d43738ec147e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:52:23 -0300 Subject: [PATCH 395/416] atlasbot: simplify fast path --- services/comms/scripts/atlasbot/bot.py | 32 ++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 50fed4b6..d0d46efe 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3605,10 +3605,33 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 4 + return 2 return 7 +def _open_ended_fast_single( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None = None, + model: str, +) -> str: + if state: + state.update("drafting", step=2, note="summarizing") + context = fact_pack + reply = _ollama_call( + ("atlasbot_fast", "atlasbot_fast"), + prompt, + context=context, + use_history=False, + model=model, + ) + if state: + state.update("done", step=_open_ended_total_steps("fast")) + return _ensure_scores(reply) + + def _open_ended_fast( prompt: str, *, @@ -3618,14 +3641,13 @@ def _open_ended_fast( history_lines: list[str], state: ThoughtState | None = None, ) -> str: - return _open_ended_multi( + model = _model_for_mode("fast") + return _open_ended_fast_single( prompt, fact_pack=fact_pack, - fact_lines=fact_lines, - fact_meta=fact_meta, history_lines=history_lines, - mode="fast", state=state, + model=model, ) From fdfc1f5857892e8b1e51316b38f7e46179c532db Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:52:40 -0300 Subject: [PATCH 396/416] comms: roll atlasbot config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 1d89335a..8607858a 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-87 + checksum/atlasbot-configmap: manual-atlasbot-88 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From a64bec005350ec3d09c3445e6cb8120965debbdb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:58:07 -0300 Subject: [PATCH 397/416] atlasbot: tighten fast facts --- services/comms/scripts/atlasbot/bot.py | 39 +++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index d0d46efe..c44c7da3 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3605,10 +3605,25 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 2 + return 3 return 7 +def _fast_fact_lines( + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + fact_ids: list[str], +) -> list[str]: + if not fact_ids: + return fact_lines + selected = [ + line + for line in fact_lines + if fact_meta.get(line, {}).get("id") in set(fact_ids) + ] + return selected or fact_lines + + def _open_ended_fast_single( prompt: str, *, @@ -3642,6 +3657,27 @@ def _open_ended_fast( state: ThoughtState | None = None, ) -> str: model = _model_for_mode("fast") + if state: + state.update("selecting", step=2, note="picking key facts") + subjective = _is_subjective_query(prompt) + focus_tags = _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + primary_ids = _open_ended_select_facts( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + focus_tags=focus_tags, + avoid_fact_ids=[], + count=3, + subjective=subjective, + state=state, + step=2, + model=model, + ) + selected_lines = _fast_fact_lines(fact_lines, fact_meta, primary_ids) + fact_pack = _fact_pack_text(selected_lines, fact_meta) return _open_ended_fast_single( prompt, fact_pack=fact_pack, @@ -4092,6 +4128,7 @@ def _ollama_call( "Do not suggest commands unless explicitly asked. " "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " "Translate metrics into natural language instead of echoing raw label/value pairs. " + "When providing counts or totals, use the exact numbers from the context; do not invent or truncate. " "Avoid bare lists unless the user asked for a list; weave numbers into sentences. " "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. " "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. " From 4effe8d7129aa81e818638e447512b68f2d18712 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 01:58:23 -0300 Subject: [PATCH 398/416] comms: roll atlasbot config --- services/comms/atlasbot-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 8607858a..a7fbea9c 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-88 + checksum/atlasbot-configmap: manual-atlasbot-89 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" From 00e4b9e9e93eb1bc307e4fdae0d9591a43156bd7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 02:21:42 -0300 Subject: [PATCH 399/416] atlasbot: rework reasoning pipeline --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 286 +++++++++++++++++------- 2 files changed, 210 insertions(+), 78 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index a7fbea9c..c9602c32 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-89 + checksum/atlasbot-configmap: manual-atlasbot-90 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index c44c7da3..ffc8a5c8 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -16,7 +16,7 @@ PASSWORD = os.environ["BOT_PASS"] ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") -MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") +MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct") MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "") MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "") FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") @@ -2895,6 +2895,7 @@ def _open_ended_system() -> str: "Use ONLY the provided fact pack and recent chat as your evidence. " "You may draw light inferences if you label them as such. " "Write concise, human sentences with a helpful, calm tone (not a list). " + "Be willing to take a light stance; do not over-hedge. " "If the question is subjective (cool/interesting/unconventional), pick a standout fact and explain why it stands out. " "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " @@ -3002,20 +3003,27 @@ def _open_ended_plan( *, fact_pack: str, history_lines: list[str], + focus_tags: set[str], + avoid_tags: set[str], count: int, state: ThoughtState | None, + step: int, model: str | None, ) -> list[dict[str, Any]]: if state: - state.update("planning", step=1, note="mapping angles") + state.update("planning", step=step, note="mapping angles") count = max(1, count) + focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any" + avoid_hint = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" prompt_text = ( "Analyze the question and propose up to " f"{count} distinct answer angles that can be supported by the fact pack. " "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). " "If the question is subjective, propose at least one angle that surfaces a standout detail. " + f"Prefer angles that align with these tags: {focus_hint}. " + f"Avoid angles that overlap these tags if possible: {avoid_hint}. " "Avoid repeating the same angle as the most recent response if possible. " - "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"priority\":1-5}]}." + "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"tags\":[\"tag\"],\"priority\":1-5}]}." ) context = _append_history_context(fact_pack, history_lines) result = _ollama_json_call( @@ -3037,10 +3045,12 @@ def _open_ended_plan( priority = item.get("priority") if not isinstance(priority, (int, float)): priority = 3 + tags = _sanitize_focus_tags(item.get("tags") or []) cleaned.append( { "focus": focus, "reason": str(item.get("reason") or ""), + "tags": tags, "priority": int(max(1, min(5, priority))), } ) @@ -3131,6 +3141,35 @@ def _preferred_tags_for_prompt(prompt: str) -> set[str]: return tags & _ALLOWED_INSIGHT_TAGS +_TAG_KEYWORDS: dict[str, tuple[str, ...]] = { + "utilization": ("cpu", "ram", "memory", "net", "network", "io", "disk", "usage", "utilization", "hottest", "busy"), + "database": ("postgres", "db", "database", "connections"), + "pods": ("pod", "pods", "deployment", "daemonset", "job", "cron", "workload"), + "hardware": ("hardware", "architecture", "arch", "rpi", "raspberry", "jetson", "amd64", "arm64", "node", "nodes"), + "availability": ("ready", "not ready", "unready", "down", "missing"), + "workloads": ("workload", "service", "namespace", "app"), + "os": ("os", "kernel", "kubelet", "containerd", "runtime"), +} + + +def _tags_from_text(text: str) -> set[str]: + q = normalize_query(text) + if not q: + return set() + tags: set[str] = set() + for tag, keywords in _TAG_KEYWORDS.items(): + if any(word in q for word in keywords): + tags.add(tag) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _history_focus_tags(history_lines: list[str]) -> set[str]: + if not history_lines: + return set() + recent = " ".join(line for line in history_lines[-6:] if isinstance(line, str)) + return _tags_from_text(recent) + + def _open_ended_insights( prompt: str, *, @@ -3139,10 +3178,11 @@ def _open_ended_insights( history_lines: list[str], count: int, state: ThoughtState | None, + step: int, model: str | None, ) -> list[dict[str, Any]]: if state: - state.update("analyzing", note="scouting insights") + state.update("analyzing", step=step, note="scouting insights") count = max(1, count) allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) prompt_text = ( @@ -3188,10 +3228,35 @@ def _open_ended_insights( return cleaned +def _rank_insights( + insights: list[dict[str, Any]], + *, + focus_tags: set[str], + avoid_tags: set[str], + count: int, +) -> list[dict[str, Any]]: + if not insights: + return [] + ranked: list[tuple[float, dict[str, Any]]] = [] + for insight in insights: + relevance = _normalize_fraction(insight.get("relevance"), default=0.6) + novelty = _normalize_fraction(insight.get("novelty"), default=0.5) + tags = set(insight.get("tags") or []) + score = relevance * 0.65 + novelty * 0.35 + if focus_tags and tags & focus_tags: + score += 0.1 + if avoid_tags and tags & avoid_tags: + score -= 0.2 + ranked.append((score, insight)) + ranked.sort(key=lambda item: item[0], reverse=True) + return [item for _, item in ranked[:count]] + + def _fallback_fact_ids( fact_meta: dict[str, dict[str, Any]], *, focus_tags: set[str], + avoid_tags: set[str], count: int, ) -> list[str]: if not fact_meta: @@ -3202,9 +3267,16 @@ def _fallback_fact_ids( for fid, meta in fact_meta.items() if focus_tags & set(meta.get("tags") or []) ] + if avoid_tags: + tagged = [fid for fid in tagged if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or []))] if tagged: return tagged[:count] - return list(fact_meta.keys())[:count] + all_ids = list(fact_meta.keys()) + if avoid_tags: + filtered = [fid for fid in all_ids if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or []))] + if filtered: + return filtered[:count] + return all_ids[:count] def _open_ended_select_facts( @@ -3214,6 +3286,7 @@ def _open_ended_select_facts( fact_meta: dict[str, dict[str, Any]], history_lines: list[str], focus_tags: set[str], + avoid_tags: set[str], avoid_fact_ids: list[str], count: int, subjective: bool, @@ -3224,11 +3297,13 @@ def _open_ended_select_facts( if state: state.update("selecting facts", step=step, note="picking evidence") focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any" + avoid_tag_hint = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" avoid_hint = ", ".join(avoid_fact_ids) if avoid_fact_ids else "none" prompt_text = ( "Select the fact IDs that best answer the question. " f"Pick up to {count} fact IDs. " f"Focus tags: {focus_hint}. " + f"Avoid these tags if possible: {avoid_tag_hint}. " f"Avoid these fact IDs: {avoid_hint}. " "If the question is subjective, pick standout or unusual facts; " "if objective, pick the minimal facts needed. " @@ -3248,7 +3323,18 @@ def _open_ended_select_facts( selected.append(fid) if len(selected) >= count: break - seed = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count) + if avoid_tags: + selected = [ + fid + for fid in selected + if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or [])) + ] or selected + seed = _fallback_fact_ids( + fact_meta, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=count, + ) if selected: for fid in seed: if fid not in selected: @@ -3483,7 +3569,7 @@ def _open_ended_multi( if mode == "fast": total_steps = 4 else: - total_steps = 7 + total_steps = 9 if state: state.total_steps = total_steps @@ -3503,41 +3589,25 @@ def _open_ended_multi( focus_tags = set(interpretation.get("focus_tags") or []) or _preferred_tags_for_prompt(prompt) if not focus_tags and subjective: focus_tags = set(_ALLOWED_INSIGHT_TAGS) + avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() - primary_ids = _open_ended_select_facts( - prompt, - fact_pack=fact_pack, - fact_meta=fact_meta, - history_lines=history_lines, - focus_tags=focus_tags, - avoid_fact_ids=[], - count=4 if mode == "deep" else 3, - subjective=subjective, - state=state, - step=2, - model=model, - ) - alternate_ids: list[str] = [] - if mode == "deep": - alternate_ids = _open_ended_select_facts( + if mode == "fast": + primary_ids = _open_ended_select_facts( prompt, fact_pack=fact_pack, fact_meta=fact_meta, history_lines=history_lines, focus_tags=focus_tags, - avoid_fact_ids=primary_ids, - count=4, + avoid_tags=avoid_tags, + avoid_fact_ids=[], + count=3, subjective=subjective, state=state, - step=3, + step=2, model=model, ) - - candidates: list[dict[str, Any]] = [] - focus_label = interpretation.get("focus_label") or "primary angle" - step = 3 if mode == "fast" else 4 - candidates.append( - _open_ended_candidate( + focus_label = interpretation.get("focus_label") or "primary angle" + candidate = _open_ended_candidate( prompt, focus=str(focus_label), fact_pack=fact_pack, @@ -3546,17 +3616,65 @@ def _open_ended_multi( tone=str(tone), allow_list=allow_list, state=state, - step=step, + step=3, fact_hints=primary_ids, model=model, ) + reply = _open_ended_synthesize( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=[candidate], + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=4, + model=model, + critique=None, + ) + if state: + state.update("done", step=total_steps) + return reply + + angles = _open_ended_plan( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=5, + state=state, + step=2, + model=model, ) - step += 1 - if mode == "deep" and alternate_ids: + if state and avoid_tags: + state.update("planning", step=2, note=f"avoiding {', '.join(sorted(avoid_tags))}") + + insights = _open_ended_insights( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + count=7, + state=state, + step=3, + model=model, + ) + ranked_insights = _rank_insights( + insights, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=3, + ) + + candidates: list[dict[str, Any]] = [] + step = 4 + for insight in ranked_insights: candidates.append( _open_ended_candidate( prompt, - focus="alternate angle", + focus=insight.get("summary") or "insight", fact_pack=fact_pack, history_lines=history_lines, subjective=subjective, @@ -3564,27 +3682,61 @@ def _open_ended_multi( allow_list=allow_list, state=state, step=step, - fact_hints=alternate_ids, + fact_hints=insight.get("fact_ids") or [], model=model, ) ) step += 1 + if not candidates and angles: + for angle in angles[:2]: + angle_tags = set(angle.get("tags") or []) or _tags_from_text(angle.get("focus") or "") + fact_ids = _open_ended_select_facts( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + focus_tags=angle_tags or focus_tags, + avoid_tags=avoid_tags, + avoid_fact_ids=[], + count=4, + subjective=subjective, + state=state, + step=step, + model=model, + ) + candidates.append( + _open_ended_candidate( + prompt, + focus=angle.get("focus") or "alternate angle", + fact_pack=fact_pack, + history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + fact_hints=fact_ids, + model=model, + ) + ) + step += 1 + if len(candidates) >= 2: + break + if state: state.update("evaluating", step=step, note="ranking candidates") - selected = _select_candidates(candidates, count=1 if mode == "fast" else 2) + selected = _select_candidates(candidates, count=2) + step += 1 + critique = _open_ended_critique( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + state=state, + step=step, + model=model, + ) step += 1 - critique = "" - if mode == "deep": - critique = _open_ended_critique( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - candidates=selected or candidates, - state=state, - step=step, - model=model, - ) - step += 1 reply = _open_ended_synthesize( prompt, fact_pack=fact_pack, @@ -3605,8 +3757,8 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 3 - return 7 + return 4 + return 9 def _fast_fact_lines( @@ -3656,34 +3808,14 @@ def _open_ended_fast( history_lines: list[str], state: ThoughtState | None = None, ) -> str: - model = _model_for_mode("fast") - if state: - state.update("selecting", step=2, note="picking key facts") - subjective = _is_subjective_query(prompt) - focus_tags = _preferred_tags_for_prompt(prompt) - if not focus_tags and subjective: - focus_tags = set(_ALLOWED_INSIGHT_TAGS) - primary_ids = _open_ended_select_facts( + return _open_ended_multi( prompt, fact_pack=fact_pack, + fact_lines=fact_lines, fact_meta=fact_meta, history_lines=history_lines, - focus_tags=focus_tags, - avoid_fact_ids=[], - count=3, - subjective=subjective, + mode="fast", state=state, - step=2, - model=model, - ) - selected_lines = _fast_fact_lines(fact_lines, fact_meta, primary_ids) - fact_pack = _fact_pack_text(selected_lines, fact_meta) - return _open_ended_fast_single( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - state=state, - model=model, ) @@ -3846,7 +3978,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): if cluster_query: context = build_context( cleaned, - allow_tools=False, + allow_tools=True, targets=[], inventory=inventory, snapshot=snapshot, @@ -3860,7 +3992,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): workloads=workloads, history_lines=history_lines, mode=mode, - allow_tools=False, + allow_tools=True, state=None, ) else: From 7a7d96ba2127d10d1c9dfccd10d50793218a10d9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 02:43:24 -0300 Subject: [PATCH 400/416] comms: tune atlasbot quick model --- services/comms/atlasbot-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c9602c32..d570fd9a 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-90 + checksum/atlasbot-configmap: manual-atlasbot-91 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -84,7 +84,7 @@ spec: - name: OLLAMA_MODEL value: qwen2.5:14b-instruct - name: ATLASBOT_MODEL_FAST - value: qwen2.5:14b-instruct + value: qwen2.5:7b-instruct-q4_0 - name: ATLASBOT_MODEL_DEEP value: qwen2.5:14b-instruct - name: OLLAMA_FALLBACK_MODEL From e1505873d39b33a187311ed962ded802276d0c9a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 02:53:43 -0300 Subject: [PATCH 401/416] atlasbot: streamline quick answers --- services/comms/scripts/atlasbot/bot.py | 120 ++++++++++--------------- 1 file changed, 47 insertions(+), 73 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index ffc8a5c8..6f18b9ea 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3562,14 +3562,10 @@ def _open_ended_multi( fact_lines: list[str], fact_meta: dict[str, dict[str, Any]], history_lines: list[str], - mode: str, state: ThoughtState | None = None, ) -> str: - model = _model_for_mode(mode) - if mode == "fast": - total_steps = 4 - else: - total_steps = 9 + model = _model_for_mode("deep") + total_steps = _open_ended_total_steps("deep") if state: state.total_steps = total_steps @@ -3591,52 +3587,6 @@ def _open_ended_multi( focus_tags = set(_ALLOWED_INSIGHT_TAGS) avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() - if mode == "fast": - primary_ids = _open_ended_select_facts( - prompt, - fact_pack=fact_pack, - fact_meta=fact_meta, - history_lines=history_lines, - focus_tags=focus_tags, - avoid_tags=avoid_tags, - avoid_fact_ids=[], - count=3, - subjective=subjective, - state=state, - step=2, - model=model, - ) - focus_label = interpretation.get("focus_label") or "primary angle" - candidate = _open_ended_candidate( - prompt, - focus=str(focus_label), - fact_pack=fact_pack, - history_lines=history_lines, - subjective=subjective, - tone=str(tone), - allow_list=allow_list, - state=state, - step=3, - fact_hints=primary_ids, - model=model, - ) - reply = _open_ended_synthesize( - prompt, - fact_pack=fact_pack, - history_lines=history_lines, - candidates=[candidate], - subjective=subjective, - tone=str(tone), - allow_list=allow_list, - state=state, - step=4, - model=model, - critique=None, - ) - if state: - state.update("done", step=total_steps) - return reply - angles = _open_ended_plan( prompt, fact_pack=fact_pack, @@ -3757,41 +3707,52 @@ def _open_ended_multi( def _open_ended_total_steps(mode: str) -> int: if mode == "fast": - return 4 + return 2 return 9 def _fast_fact_lines( fact_lines: list[str], fact_meta: dict[str, dict[str, Any]], - fact_ids: list[str], + *, + focus_tags: set[str], + avoid_tags: set[str], + limit: int = 10, ) -> list[str]: - if not fact_ids: - return fact_lines - selected = [ - line - for line in fact_lines - if fact_meta.get(line, {}).get("id") in set(fact_ids) - ] - return selected or fact_lines + if not fact_lines: + return [] + selected: list[str] = [] + for idx, line in enumerate(fact_lines): + fid = f"F{idx + 1}" + tags = set(fact_meta.get(fid, {}).get("tags") or []) + if focus_tags and not (focus_tags & tags): + continue + if avoid_tags and (avoid_tags & tags): + continue + selected.append(line) + if len(selected) >= limit: + break + if selected: + return selected + trimmed = fact_lines[:limit] + return trimmed or fact_lines def _open_ended_fast_single( prompt: str, *, - fact_pack: str, - history_lines: list[str], + context: str, state: ThoughtState | None = None, model: str, ) -> str: if state: - state.update("drafting", step=2, note="summarizing") - context = fact_pack + state.update("drafting", step=1, note="summarizing") reply = _ollama_call( ("atlasbot_fast", "atlasbot_fast"), prompt, context=context, use_history=False, + system_override=_open_ended_system(), model=model, ) if state: @@ -3808,14 +3769,28 @@ def _open_ended_fast( history_lines: list[str], state: ThoughtState | None = None, ) -> str: - return _open_ended_multi( + model = _model_for_mode("fast") + subjective = _is_subjective_query(prompt) + focus_tags = _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() + selected_lines = _fast_fact_lines( + fact_lines, + fact_meta, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + ) + selected_meta = _fact_pack_meta(selected_lines) + selected_pack = _fact_pack_text(selected_lines, selected_meta) + context = _append_history_context(selected_pack, history_lines) + if state: + state.total_steps = _open_ended_total_steps("fast") + return _open_ended_fast_single( prompt, - fact_pack=fact_pack, - fact_lines=fact_lines, - fact_meta=fact_meta, - history_lines=history_lines, - mode="fast", + context=context, state=state, + model=model, ) @@ -3834,7 +3809,6 @@ def _open_ended_deep( fact_lines=fact_lines, fact_meta=fact_meta, history_lines=history_lines, - mode="deep", state=state, ) From b257e4fc10f79affe10ce6547f6b2e02f2c465c0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:09:34 -0300 Subject: [PATCH 402/416] atlasbot: enrich fact pack summaries --- services/comms/scripts/atlasbot/bot.py | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6f18b9ea..96765b13 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -1037,6 +1037,11 @@ def facts_context( nodes_list = by_hardware.get(key) or [] if nodes_list: lines.append(f"- {key}: {', '.join(nodes_list)}") + if by_hardware: + counts = {key: len(nodes_list) for key, nodes_list in by_hardware.items() if nodes_list} + if counts: + parts = [f"{key}={count}" for key, count in sorted(counts.items())] + lines.append(f"- nodes_by_hardware_count: {', '.join(parts)}") non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", []))) if non_rpi: lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}") @@ -1096,6 +1101,25 @@ def facts_context( value = metrics.get(key) if value is not None: lines.append(f"- {key}: {value}") + if workloads: + ns_counts: dict[str, int] = collections.defaultdict(int) + for entry in workloads: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + pods = entry.get("pods_running") + if pods is None: + pods = entry.get("pods_total") + try: + pods_val = int(pods) + except (TypeError, ValueError): + pods_val = 0 + if ns: + ns_counts[ns] += pods_val + if ns_counts: + top_ns = sorted(ns_counts.items(), key=lambda item: item[1], reverse=True)[:5] + parts = [f"{ns}={count}" for ns, count in top_ns] + lines.append(f"- pods_by_namespace: {', '.join(parts)}") top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else [] if top_restarts: @@ -2725,6 +2749,8 @@ def _fact_line_tags(line: str) -> set[str]: tags.add("database") if "pods_" in text or "pod phases" in text or "restarts" in text: tags.add("pods") + if "namespace" in text: + tags.add("workloads") if "workloads" in text or "primary_node" in text or "workload_" in text: tags.add("workloads") if "node_details" in text: @@ -2900,6 +2926,8 @@ def _open_ended_system() -> str: "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " + "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " + "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " "Do not invent numbers or facts. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." ) From c2916e60c15cf90f3128cf7ce00cc4cf83314d77 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:14:12 -0300 Subject: [PATCH 403/416] atlasbot: prioritize fact selection for quick answers --- services/comms/scripts/atlasbot/bot.py | 56 +++++++++++++++++++++----- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 96765b13..43f578b0 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3169,6 +3169,23 @@ def _preferred_tags_for_prompt(prompt: str) -> set[str]: return tags & _ALLOWED_INSIGHT_TAGS +def _primary_tags_for_prompt(prompt: str) -> set[str]: + q = normalize_query(prompt) + if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")): + return {"utilization"} + if any(word in q for word in ("postgres", "database", "db", "connections")): + return {"database"} + if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")): + return {"pods"} + if any(word in q for word in ("workload", "service", "namespace")): + return {"workloads"} + if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")): + return {"availability"} + if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")): + return {"hardware", "inventory", "architecture"} + return set() + + _TAG_KEYWORDS: dict[str, tuple[str, ...]] = { "utilization": ("cpu", "ram", "memory", "net", "network", "io", "disk", "usage", "utilization", "hottest", "busy"), "database": ("postgres", "db", "database", "connections"), @@ -3745,25 +3762,43 @@ def _fast_fact_lines( *, focus_tags: set[str], avoid_tags: set[str], + primary_tags: set[str] | None = None, limit: int = 10, ) -> list[str]: if not fact_lines: return [] - selected: list[str] = [] + primary_tags = primary_tags or set() + scored: list[tuple[int, int, str]] = [] for idx, line in enumerate(fact_lines): fid = f"F{idx + 1}" tags = set(fact_meta.get(fid, {}).get("tags") or []) - if focus_tags and not (focus_tags & tags): - continue if avoid_tags and (avoid_tags & tags): continue - selected.append(line) + score = 0 + if primary_tags: + score += 4 * len(tags & primary_tags) + if focus_tags: + score += 2 * len(tags & focus_tags) + scored.append((score, idx, line)) + scored.sort(key=lambda item: (-item[0], item[1])) + selected: list[str] = [] + for score, _, line in scored: + if score <= 0 and selected: + break + if score > 0: + selected.append(line) if len(selected) >= limit: break - if selected: - return selected - trimmed = fact_lines[:limit] - return trimmed or fact_lines + if not selected: + selected = [line for _, _, line in scored[:limit]] + elif len(selected) < limit: + for _, _, line in scored: + if line in selected: + continue + selected.append(line) + if len(selected) >= limit: + break + return selected def _open_ended_fast_single( @@ -3799,6 +3834,7 @@ def _open_ended_fast( ) -> str: model = _model_for_mode("fast") subjective = _is_subjective_query(prompt) + primary_tags = _primary_tags_for_prompt(prompt) focus_tags = _preferred_tags_for_prompt(prompt) if not focus_tags and subjective: focus_tags = set(_ALLOWED_INSIGHT_TAGS) @@ -3808,15 +3844,15 @@ def _open_ended_fast( fact_meta, focus_tags=focus_tags, avoid_tags=avoid_tags, + primary_tags=primary_tags, ) selected_meta = _fact_pack_meta(selected_lines) selected_pack = _fact_pack_text(selected_lines, selected_meta) - context = _append_history_context(selected_pack, history_lines) if state: state.total_steps = _open_ended_total_steps("fast") return _open_ended_fast_single( prompt, - context=context, + context=selected_pack, state=state, model=model, ) From 483c3566f4d1497bbc87bfa4bb67718c7d9e4d15 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:17:46 -0300 Subject: [PATCH 404/416] atlasbot: enforce fast answer body --- services/comms/scripts/atlasbot/bot.py | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 43f578b0..7d47423f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2926,6 +2926,7 @@ def _open_ended_system() -> str: "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " + "Always include at least one substantive answer sentence before the score lines. " "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " "Do not invent numbers or facts. " @@ -3801,6 +3802,24 @@ def _fast_fact_lines( return selected +def _has_body_lines(answer: str) -> bool: + lines = [line.strip() for line in (answer or "").splitlines() if line.strip()] + for line in lines: + lowered = line.lower() + if lowered.startswith("confidence"): + continue + if lowered.startswith("relevance"): + continue + if lowered.startswith("satisfaction"): + continue + if lowered.startswith("hallucinationrisk"): + continue + if lowered.startswith("hallucination risk"): + continue + return True + return False + + def _open_ended_fast_single( prompt: str, *, @@ -3818,6 +3837,15 @@ def _open_ended_fast_single( system_override=_open_ended_system(), model=model, ) + if not _has_body_lines(reply): + reply = _ollama_call( + ("atlasbot_fast", "atlasbot_fast"), + prompt + " Provide one clear sentence before the score lines.", + context=context, + use_history=False, + system_override=_open_ended_system(), + model=model, + ) if state: state.update("done", step=_open_ended_total_steps("fast")) return _ensure_scores(reply) From c8630ddae8231bfb8436802e972bfa7168983eb1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:20:28 -0300 Subject: [PATCH 405/416] atlasbot: fix tag detection for workload queries --- services/comms/scripts/atlasbot/bot.py | 29 ++++++++++++++------------ 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7d47423f..b73d3f3d 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3154,35 +3154,37 @@ def _open_ended_interpret( def _preferred_tags_for_prompt(prompt: str) -> set[str]: q = normalize_query(prompt) + tokens = set(_tokens(prompt)) tags: set[str] = set() - if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")): + if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}: tags.add("utilization") - if any(word in q for word in ("postgres", "database", "db", "connections")): + if tokens & {"postgres", "database", "db", "connections"}: tags.add("database") - if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")): + if tokens & {"pod", "pods", "deployment", "job", "cronjob"}: tags.add("pods") - if any(word in q for word in ("workload", "service", "namespace")): + if tokens & {"workload", "service", "namespace"}: tags.add("workloads") - if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")): + if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q: tags.add("availability") - if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")): + if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}: tags.update({"hardware", "inventory", "architecture"}) return tags & _ALLOWED_INSIGHT_TAGS def _primary_tags_for_prompt(prompt: str) -> set[str]: q = normalize_query(prompt) - if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")): + tokens = set(_tokens(prompt)) + if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}: return {"utilization"} - if any(word in q for word in ("postgres", "database", "db", "connections")): + if tokens & {"postgres", "database", "db", "connections"}: return {"database"} - if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")): + if tokens & {"pod", "pods", "deployment", "job", "cronjob"}: return {"pods"} - if any(word in q for word in ("workload", "service", "namespace")): + if tokens & {"workload", "service", "namespace"}: return {"workloads"} - if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")): + if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q: return {"availability"} - if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")): + if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}: return {"hardware", "inventory", "architecture"} return set() @@ -3202,9 +3204,10 @@ def _tags_from_text(text: str) -> set[str]: q = normalize_query(text) if not q: return set() + tokens = set(_tokens(text)) tags: set[str] = set() for tag, keywords in _TAG_KEYWORDS.items(): - if any(word in q for word in keywords): + if any(word in tokens for word in keywords): tags.add(tag) return tags & _ALLOWED_INSIGHT_TAGS From 5bc90929be23c42e8f045bc177ec28c16230b7a4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:23:54 -0300 Subject: [PATCH 406/416] comms: use 14b model for atlasbot quick --- services/comms/atlasbot-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index d570fd9a..6fbd3271 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-91 + checksum/atlasbot-configmap: manual-atlasbot-92 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -84,7 +84,7 @@ spec: - name: OLLAMA_MODEL value: qwen2.5:14b-instruct - name: ATLASBOT_MODEL_FAST - value: qwen2.5:7b-instruct-q4_0 + value: qwen2.5:14b-instruct-q4_0 - name: ATLASBOT_MODEL_DEEP value: qwen2.5:14b-instruct - name: OLLAMA_FALLBACK_MODEL From 48b46972abe1b185d912c9101c99d359d9ef046b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:29:21 -0300 Subject: [PATCH 407/416] atlasbot: add fact-pack fallback for fast --- services/comms/scripts/atlasbot/bot.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index b73d3f3d..4fa67d40 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3823,6 +3823,37 @@ def _has_body_lines(answer: str) -> bool: return False +def _fallback_fact_answer(prompt: str, context: str) -> str: + facts: list[str] = [] + for line in (context or "").splitlines(): + trimmed = line.strip() + if not trimmed.startswith("F"): + continue + if ":" not in trimmed: + continue + fact = trimmed.split(":", 1)[1].strip() + if fact.startswith("-"): + fact = fact.lstrip("-").strip() + if fact: + facts.append(fact) + if not facts: + return "" + tokens = set(_tokens(prompt)) + best_fact = "" + best_score = -1 + for fact in facts: + score = len(tokens & set(_tokens(fact))) + if score > best_score: + best_score = score + best_fact = fact + if best_score <= 0: + return "" + sentence = f"Based on the snapshot, {best_fact}" + if not sentence.endswith((".", "!", "?")): + sentence += "." + return sentence + + def _open_ended_fast_single( prompt: str, *, @@ -3849,6 +3880,10 @@ def _open_ended_fast_single( system_override=_open_ended_system(), model=model, ) + if not _has_body_lines(reply): + fallback = _fallback_fact_answer(prompt, context) + if fallback: + reply = fallback if state: state.update("done", step=_open_ended_total_steps("fast")) return _ensure_scores(reply) From 9be79f07cd6d47e3661b0205d0e46894a807b868 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:32:17 -0300 Subject: [PATCH 408/416] atlasbot: prefer fact fallback for quantitative prompts --- services/comms/scripts/atlasbot/bot.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 4fa67d40..8806d2aa 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3854,6 +3854,18 @@ def _fallback_fact_answer(prompt: str, context: str) -> str: return sentence +def _is_quantitative_prompt(prompt: str) -> bool: + q = normalize_query(prompt) + if not q: + return False + tokens = set(_tokens(prompt)) + if "how many" in q or "count" in tokens or "total" in tokens: + return True + if tokens & {"highest", "lowest", "hottest", "most", "least"}: + return True + return False + + def _open_ended_fast_single( prompt: str, *, @@ -3880,10 +3892,9 @@ def _open_ended_fast_single( system_override=_open_ended_system(), model=model, ) - if not _has_body_lines(reply): - fallback = _fallback_fact_answer(prompt, context) - if fallback: - reply = fallback + fallback = _fallback_fact_answer(prompt, context) + if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)): + reply = fallback if state: state.update("done", step=_open_ended_total_steps("fast")) return _ensure_scores(reply) From 98dc7284e7a7c8b7574451916be2adb1638435c8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:35:02 -0300 Subject: [PATCH 409/416] atlasbot: fix fallback fact parsing --- services/comms/scripts/atlasbot/bot.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 8806d2aa..e0f84175 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3829,9 +3829,12 @@ def _fallback_fact_answer(prompt: str, context: str) -> str: trimmed = line.strip() if not trimmed.startswith("F"): continue - if ":" not in trimmed: + match = re.match(r"^F\\d+.*?\\]:\\s*(.*)$", trimmed) + if not match: + match = re.match(r"^F\\d+:\\s*(.*)$", trimmed) + if not match: continue - fact = trimmed.split(":", 1)[1].strip() + fact = match.group(1).strip() if fact.startswith("-"): fact = fact.lstrip("-").strip() if fact: From 55a05c757f94964cda6a07cda210ecbd27aaeeb6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:46:06 -0300 Subject: [PATCH 410/416] atlasbot: refine fast fact selection and prompts --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 56 +++++++++++++++++++++---- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 6fbd3271..f007942d 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-92 + checksum/atlasbot-configmap: manual-atlasbot-93 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e0f84175..5ce19845 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -253,11 +253,13 @@ def normalize_query(text: str) -> str: cleaned = (text or "").lower() for ch in _DASH_CHARS: cleaned = cleaned.replace(ch, "-") + cleaned = cleaned.replace("_", " ") cleaned = re.sub(r"\s+", " ", cleaned).strip() return cleaned def _tokens(text: str) -> list[str]: - toks = [t.lower() for t in TOKEN_RE.findall(text or "")] + cleaned = re.sub(r"[\\_/]", " ", text or "") + toks = [t.lower() for t in TOKEN_RE.findall(cleaned)] return [t for t in toks if t not in STOPWORDS and len(t) >= 2] @@ -2730,6 +2732,18 @@ _ALLOWED_INSIGHT_TAGS = { _DYNAMIC_TAGS = {"availability", "database", "pods", "utilization", "workloads"} _INVENTORY_TAGS = {"hardware", "architecture", "inventory", "workers", "node_detail", "os"} +_SUBJECTIVE_TAG_PRIORITY = ( + "utilization", + "database", + "pods", + "workloads", + "availability", + "hardware", + "inventory", + "architecture", + "node_detail", + "os", +) def _fact_line_tags(line: str) -> set[str]: @@ -2922,7 +2936,8 @@ def _open_ended_system() -> str: "You may draw light inferences if you label them as such. " "Write concise, human sentences with a helpful, calm tone (not a list). " "Be willing to take a light stance; do not over-hedge. " - "If the question is subjective (cool/interesting/unconventional), pick a standout fact and explain why it stands out. " + "If the question is subjective (cool/interesting/unconventional), pick a standout fact, explain why it stands out, " + "and use 2-3 sentences. " "If the question asks for a list, embed the list inline in a sentence (comma-separated). " "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " @@ -3773,6 +3788,8 @@ def _fast_fact_lines( return [] primary_tags = primary_tags or set() scored: list[tuple[int, int, str]] = [] + priority_map = {tag: idx for idx, tag in enumerate(_SUBJECTIVE_TAG_PRIORITY)} + use_priority = not primary_tags and focus_tags == _ALLOWED_INSIGHT_TAGS for idx, line in enumerate(fact_lines): fid = f"F{idx + 1}" tags = set(fact_meta.get(fid, {}).get("tags") or []) @@ -3783,6 +3800,12 @@ def _fast_fact_lines( score += 4 * len(tags & primary_tags) if focus_tags: score += 2 * len(tags & focus_tags) + if use_priority and tags: + bonus = 0 + for tag in tags: + if tag in priority_map: + bonus = max(bonus, len(priority_map) - priority_map[tag]) + score += bonus scored.append((score, idx, line)) scored.sort(key=lambda item: (-item[0], item[1])) selected: list[str] = [] @@ -3845,13 +3868,27 @@ def _fallback_fact_answer(prompt: str, context: str) -> str: best_fact = "" best_score = -1 for fact in facts: - score = len(tokens & set(_tokens(fact))) + key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", fact) + if not key_match: + key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", fact) + key_tokens: set[str] = set() + if key_match: + key_tokens = set(_tokens(key_match.group(1))) + score = len(tokens & set(_tokens(fact))) + 2 * len(tokens & key_tokens) if score > best_score: best_score = score best_fact = fact if best_score <= 0: return "" - sentence = f"Based on the snapshot, {best_fact}" + key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", best_fact) + if not key_match: + key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", best_fact) + if key_match: + key = key_match.group(1).strip().replace("_", " ") + val = key_match.group(2).strip() + sentence = f"{key.capitalize()} is {val}" + else: + sentence = f"Based on the snapshot, {best_fact}" if not sentence.endswith((".", "!", "?")): sentence += "." return sentence @@ -3873,15 +3910,17 @@ def _open_ended_fast_single( prompt: str, *, context: str, + history_lines: list[str] | None = None, state: ThoughtState | None = None, model: str, ) -> str: if state: state.update("drafting", step=1, note="summarizing") + working_context = _append_history_context(context, history_lines or []) if history_lines else context reply = _ollama_call( ("atlasbot_fast", "atlasbot_fast"), prompt, - context=context, + context=working_context, use_history=False, system_override=_open_ended_system(), model=model, @@ -3890,7 +3929,7 @@ def _open_ended_fast_single( reply = _ollama_call( ("atlasbot_fast", "atlasbot_fast"), prompt + " Provide one clear sentence before the score lines.", - context=context, + context=working_context, use_history=False, system_override=_open_ended_system(), model=model, @@ -3933,6 +3972,7 @@ def _open_ended_fast( return _open_ended_fast_single( prompt, context=selected_pack, + history_lines=history_lines, state=state, model=model, ) @@ -4089,7 +4129,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler): cleaned_q = normalize_query(cleaned) cluster_affinity = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) subjective = _is_subjective_query(cleaned) - followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS) + followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS) contextual = history_cluster and (followup or followup_affinity) cluster_query = cluster_affinity or contextual context = "" @@ -4633,7 +4673,7 @@ def sync_loop(token: str, room_id: str): cleaned_q = normalize_query(cleaned_body) cluster_affinity = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) subjective = _is_subjective_query(cleaned_body) - followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS) + followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS) contextual = history_cluster and (followup or followup_affinity) cluster_query = cluster_affinity or contextual context = "" From d68252d51bfa6ce72d7c0ac3f723cf2139b94c81 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:51:37 -0300 Subject: [PATCH 411/416] atlasbot: expand fast context for quantitative prompts --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index f007942d..7856eed4 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-93 + checksum/atlasbot-configmap: manual-atlasbot-94 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 5ce19845..81212ff3 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3967,6 +3967,8 @@ def _open_ended_fast( ) selected_meta = _fact_pack_meta(selected_lines) selected_pack = _fact_pack_text(selected_lines, selected_meta) + if _is_quantitative_prompt(prompt) or not selected_lines: + selected_pack = fact_pack if state: state.total_steps = _open_ended_total_steps("fast") return _open_ended_fast_single( From 9144953519c5f2538976a8706e15ff9a0ac7d0fe Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 03:56:26 -0300 Subject: [PATCH 412/416] atlasbot: improve fast fallback and usage filtering --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 36 ++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7856eed4..79946181 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-94 + checksum/atlasbot-configmap: manual-atlasbot-95 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 81212ff3..357941bd 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -923,7 +923,7 @@ def _nodes_by_arch(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: grouped[(node.get("arch") or "unknown")].append(node["name"]) return {k: sorted(v) for k, v in grouped.items()} -def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]: +def _node_usage_table(metrics: dict[str, Any], *, allowed_nodes: set[str] | None = None) -> list[dict[str, Any]]: usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} per_node: dict[str, dict[str, Any]] = {} for metric_name, entries in usage.items() if isinstance(usage, dict) else []: @@ -935,6 +935,8 @@ def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]: node = entry.get("node") if not isinstance(node, str) or not node: continue + if allowed_nodes and node not in allowed_nodes: + continue per_node.setdefault(node, {})[metric_name] = entry.get("value") return [{"node": node, **vals} for node, vals in sorted(per_node.items())] @@ -1139,7 +1141,8 @@ def facts_context( if items: lines.append(f"- top_restarts_1h: {', '.join(items)}") - usage_table = _node_usage_table(metrics) + allowed_nodes = {node.get("name") for node in inv if isinstance(node, dict) and node.get("name")} + usage_table = _node_usage_table(metrics, allowed_nodes=allowed_nodes or None) if usage_table: lines.append("- node_usage (cpu/ram/net/io):") for entry in usage_table: @@ -3906,6 +3909,31 @@ def _is_quantitative_prompt(prompt: str) -> bool: return False +def _is_list_prompt(prompt: str) -> bool: + q = normalize_query(prompt) + if not q: + return False + if any(phrase in q for phrase in ("list", "names", "name", "show")): + return True + if any(phrase in q for phrase in ("which nodes", "what nodes", "what are the nodes")): + return True + return False + + +def _needs_full_fact_pack(prompt: str) -> bool: + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + if _is_quantitative_prompt(prompt) or _is_list_prompt(prompt): + return True + if tokens & {"workload", "pods", "namespace"}: + return True + if _NAME_INDEX and tokens & _NAME_INDEX: + return True + if any(phrase in q for phrase in ("where does", "where is", "where are", "running", "run on", "hosted on", "primary node")): + return True + return False + + def _open_ended_fast_single( prompt: str, *, @@ -3937,6 +3965,8 @@ def _open_ended_fast_single( fallback = _fallback_fact_answer(prompt, context) if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)): reply = fallback + if not _has_body_lines(reply): + reply = "I don't have enough data in the current snapshot to answer that." if state: state.update("done", step=_open_ended_total_steps("fast")) return _ensure_scores(reply) @@ -3967,7 +3997,7 @@ def _open_ended_fast( ) selected_meta = _fact_pack_meta(selected_lines) selected_pack = _fact_pack_text(selected_lines, selected_meta) - if _is_quantitative_prompt(prompt) or not selected_lines: + if _needs_full_fact_pack(prompt) or not selected_lines: selected_pack = fact_pack if state: state.total_steps = _open_ended_total_steps("fast") From 17d144dcb6e84bda9fa3e23d68c1ad05c46dbb6b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 04:00:13 -0300 Subject: [PATCH 413/416] atlasbot: clean fact labels and non-cluster confidence --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 79946181..58a55641 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-95 + checksum/atlasbot-configmap: manual-atlasbot-96 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 357941bd..59a8c2d4 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2945,6 +2945,7 @@ def _open_ended_system() -> str: "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " "Always include at least one substantive answer sentence before the score lines. " + "Do not mention fact IDs or internal labels (e.g., F1/F2) in your response. " "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " "Do not invent numbers or facts. " @@ -4091,6 +4092,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s system_override=system, model=model, ) + reply = re.sub(r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", "", reply, flags=re.IGNORECASE).strip() return _ensure_scores(reply) From 9d8b48fbf571052d0a9cd596aa030d1c77904910 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 04:06:24 -0300 Subject: [PATCH 414/416] atlasbot: expand full-pack triggers and strip inline confidence --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 58a55641..7001190a 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-96 + checksum/atlasbot-configmap: manual-atlasbot-97 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 59a8c2d4..6f3581f9 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3030,7 +3030,14 @@ def _ensure_scores(answer: str) -> str: ): _record_score("hallucinationrisk", _extract_value(cleaned)) continue - body_lines.append(line) + cleaned_body = re.sub( + r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", + "", + line, + flags=re.IGNORECASE, + ).strip() + if cleaned_body: + body_lines.append(cleaned_body) confidence = score_map.get("confidence") or "medium" relevance = score_map.get("relevance") or "70" @@ -3926,7 +3933,7 @@ def _needs_full_fact_pack(prompt: str) -> bool: tokens = set(_tokens(prompt)) if _is_quantitative_prompt(prompt) or _is_list_prompt(prompt): return True - if tokens & {"workload", "pods", "namespace"}: + if tokens & {"workload", "pods", "namespace", "worker", "workers"}: return True if _NAME_INDEX and tokens & _NAME_INDEX: return True From eb074d98583d322446adeb5b6539dac7d76e4c91 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 04:10:31 -0300 Subject: [PATCH 415/416] atlasbot: favor factual fallback in fast mode --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 7001190a..187cd6c1 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-97 + checksum/atlasbot-configmap: manual-atlasbot-98 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 6f3581f9..7fcc066f 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -2948,6 +2948,7 @@ def _open_ended_system() -> str: "Do not mention fact IDs or internal labels (e.g., F1/F2) in your response. " "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " + "Do not convert counts into percentages or claim 100% unless a fact explicitly states a percentage. " "Do not invent numbers or facts. " "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." ) @@ -4007,6 +4008,10 @@ def _open_ended_fast( selected_pack = _fact_pack_text(selected_lines, selected_meta) if _needs_full_fact_pack(prompt) or not selected_lines: selected_pack = fact_pack + if not subjective and _needs_full_fact_pack(prompt): + fallback = _fallback_fact_answer(prompt, fact_pack) + if fallback: + return _ensure_scores(fallback) if state: state.total_steps = _open_ended_total_steps("fast") return _open_ended_fast_single( From f75040bacacf2c79c3d938d4db08b23c65ae2fe1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 28 Jan 2026 11:02:10 -0300 Subject: [PATCH 416/416] atlasbot: improve fact parsing and fallback answers --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 227 ++++++++++++++++++++++-- 2 files changed, 210 insertions(+), 19 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 187cd6c1..b65aef08 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-98 + checksum/atlasbot-configmap: manual-atlasbot-101 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 7fcc066f..be256c0e 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -260,7 +260,24 @@ def normalize_query(text: str) -> str: def _tokens(text: str) -> list[str]: cleaned = re.sub(r"[\\_/]", " ", text or "") toks = [t.lower() for t in TOKEN_RE.findall(cleaned)] - return [t for t in toks if t not in STOPWORDS and len(t) >= 2] + expanded: list[str] = [] + synonyms = { + "network": "net", + "net": "network", + "memory": "ram", + "ram": "memory", + "i/o": "io", + } + for token in toks: + expanded.append(token) + if "-" in token: + expanded.extend(part for part in token.split("-") if part) + for token in list(expanded): + if token in synonyms: + expanded.append(synonyms[token]) + if token.endswith("s") and len(token) > 3: + expanded.append(token.rstrip("s")) + return [t for t in expanded if t not in STOPWORDS and len(t) >= 2] def _ensure_confidence(text: str) -> str: @@ -1077,10 +1094,16 @@ def facts_context( lines.append(f"- expected_workers_missing: {', '.join(missing)}") hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + usage_metrics = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} for key in ("cpu", "ram", "net", "io"): entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} node = entry.get("node") value = entry.get("value") + if not node or value is None: + usage = usage_metrics.get(key) if isinstance(usage_metrics.get(key), list) else [] + pick = _node_usage_top(usage, allowed_nodes=None) + if pick: + node, value = pick if node and value is not None: value_fmt = _format_metric_value( str(value), @@ -3001,6 +3024,7 @@ def _ensure_scores(answer: str) -> str: def _record_score(key: str, value: str): if not value: return + value = value.strip().rstrip("%") score_map.setdefault(key, value) for line in lines: @@ -3010,10 +3034,10 @@ def _ensure_scores(answer: str) -> str: "confidence" in lowered and "relevance" in lowered and "satisfaction" in lowered ): for key in ("confidence", "relevance", "satisfaction"): - match = re.search(rf"{key}\\s*[:=]?\\s*(\\d{{1,3}}|high|medium|low)", lowered) + match = re.search(rf"{key}\s*[:=]?\s*(\d{{1,3}}|high|medium|low)", lowered) if match: _record_score(key, match.group(1)) - risk_match = re.search(r"hallucination\\s*risk\\s*[:=]?\\s*(low|medium|high)", lowered) + risk_match = re.search(r"hallucination\s*risk\s*[:=]?\s*(low|medium|high)", lowered) if risk_match: _record_score("hallucinationrisk", risk_match.group(1)) continue @@ -3032,11 +3056,18 @@ def _ensure_scores(answer: str) -> str: _record_score("hallucinationrisk", _extract_value(cleaned)) continue cleaned_body = re.sub( - r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", + r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", line, flags=re.IGNORECASE, ).strip() + cleaned_body = re.sub( + r"\bconfident\s*level\s*:\s*(high|medium|low)\b\.?\s*", + "", + cleaned_body, + flags=re.IGNORECASE, + ).strip() + cleaned_body = re.sub(r"\bF\d+\b", "", cleaned_body).strip() if cleaned_body: body_lines.append(cleaned_body) @@ -3860,41 +3891,195 @@ def _has_body_lines(answer: str) -> bool: def _fallback_fact_answer(prompt: str, context: str) -> str: facts: list[str] = [] + parsed_facts: list[tuple[str, str | None, str | None]] = [] + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) for line in (context or "").splitlines(): trimmed = line.strip() - if not trimmed.startswith("F"): + if not trimmed: continue - match = re.match(r"^F\\d+.*?\\]:\\s*(.*)$", trimmed) - if not match: - match = re.match(r"^F\\d+:\\s*(.*)$", trimmed) - if not match: - continue - fact = match.group(1).strip() + if trimmed.startswith("F"): + match = re.match(r"^F\d+.*?\]:\s*(.*)$", trimmed) + if not match: + match = re.match(r"^F\d+:\s*(.*)$", trimmed) + if not match: + continue + fact = match.group(1).strip() + else: + if trimmed.lower().startswith("fact pack") or trimmed.lower().startswith("facts"): + continue + if trimmed.startswith("-"): + fact = trimmed.lstrip("-").strip() + else: + fact = trimmed if fact.startswith("-"): fact = fact.lstrip("-").strip() - if fact: + if fact and (":" in fact or "=" in fact): facts.append(fact) + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact) + if not key_match: + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact) + if key_match: + parsed_facts.append((fact, key_match.group(1).strip(), key_match.group(2).strip())) + else: + parsed_facts.append((fact, None, None)) if not facts: return "" - tokens = set(_tokens(prompt)) + + def _norm_key(text: str) -> str: + return normalize_query(text).replace(" ", "_") + + def _find_value(target: str) -> str | None: + for _fact, key, val in parsed_facts: + if key and _norm_key(key) == target: + return val + return None + + def _parse_counts(text: str) -> dict[str, int]: + counts: dict[str, int] = {} + for part in (text or "").split(","): + if "=" not in part: + continue + k, v = part.split("=", 1) + k = k.strip() + v = v.strip() + if not k or not v: + continue + try: + counts[k] = int(float(v)) + except ValueError: + continue + return counts + + def _parse_map(text: str) -> dict[str, str]: + mapping: dict[str, str] = {} + pattern = re.compile(r"(\w+)\s*=\s*([^=]+?)(?=(?:\s*,\s*\w+\s*=)|$)") + for match in pattern.finditer(text or ""): + mapping[match.group(1).strip()] = match.group(2).strip().strip(",") + return mapping + + list_intent = _is_list_prompt(prompt) or "name" in tokens + count_intent = _is_quantitative_prompt(prompt) and ("how many" in q or "count" in tokens or "number" in tokens) + hottest_intent = any(word in q for word in ("hottest", "highest", "most", "top", "busiest")) + metric = _detect_metric(q) + include_hw, _exclude_hw = _detect_hardware_filters(q) + + if hottest_intent and metric in {"cpu", "ram", "net", "io"}: + hottest_val = _find_value(f"hottest_{metric}") + if hottest_val: + return f"Hottest {metric} is {hottest_val}." + if hottest_intent and tokens & {"postgres", "database", "db", "connections"}: + hottest_db = _find_value("postgres_hottest_db") + if hottest_db: + return f"Hottest database is {hottest_db}." + + if count_intent and tokens & {"pods", "pod"}: + pending = _find_value("pods_pending") + failed = _find_value("pods_failed") + running = _find_value("pods_running") + succeeded = _find_value("pods_succeeded") + if "pending" in q and "failed" in q: + try: + total = float(pending or 0) + float(failed or 0) + return f"Pods pending or failed: {total:.0f}." + except ValueError: + pass + if "pending" in q and pending is not None: + return f"Pods pending is {pending}." + if "failed" in q and failed is not None: + return f"Pods failed is {failed}." + if "succeeded" in q and succeeded is not None: + return f"Pods succeeded is {succeeded}." + if "running" in q and running is not None: + return f"Pods running is {running}." + + if count_intent and tokens & {"nodes", "node"} and "not ready" in q: + nodes_total = _find_value("nodes_total") + if nodes_total and "not_ready" in nodes_total: + match = re.search(r"not_ready=([0-9.]+)", nodes_total) + if match: + return f"Not ready nodes: {match.group(1)}." + + if count_intent and include_hw: + counts_line = _find_value("nodes_by_hardware_count") + if counts_line: + counts = _parse_counts(counts_line) + for hw in include_hw: + if hw in counts: + return f"{hw} nodes: {counts[hw]}." + for hw in include_hw: + hw_line = _find_value(hw) + if hw_line: + items = [item.strip() for item in hw_line.split(",") if item.strip()] + return f"{hw} nodes: {len(items)}." + + if list_intent and include_hw: + if "control" in q: + cp_by_hw = _find_value("control_plane_by_hardware") + if cp_by_hw: + mapping = _parse_map(cp_by_hw) + for hw in include_hw: + if hw in mapping: + return f"{hw} control-plane nodes: {mapping[hw]}." + cp_nodes = _find_value("control_plane_nodes") + if cp_nodes: + return f"Control-plane nodes: {cp_nodes}." + for hw in include_hw: + hw_line = _find_value(hw) + if hw_line: + return f"{hw} nodes: {hw_line}." + + if list_intent and "control" in q: + cp_nodes = _find_value("control_plane_nodes") + if cp_nodes: + return f"Control-plane nodes: {cp_nodes}." + + preferred = tokens & { + "node", + "nodes", + "pod", + "pods", + "postgres", + "db", + "database", + "namespace", + "workload", + "worker", + "workers", + "cpu", + "ram", + "memory", + "net", + "network", + "io", + "disk", + "connection", + "connections", + } best_fact = "" best_score = -1 for fact in facts: - key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", fact) + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact) if not key_match: - key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", fact) + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact) key_tokens: set[str] = set() if key_match: key_tokens = set(_tokens(key_match.group(1))) score = len(tokens & set(_tokens(fact))) + 2 * len(tokens & key_tokens) + if preferred: + score += 3 * len(preferred & key_tokens) + if not (preferred & key_tokens): + score -= 1 + if list_intent and key_match and "count" in key_tokens: + score -= 3 if score > best_score: best_score = score best_fact = fact if best_score <= 0: return "" - key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", best_fact) + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", best_fact) if not key_match: - key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", best_fact) + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", best_fact) if key_match: key = key_match.group(1).strip().replace("_", " ") val = key_match.group(2).strip() @@ -3936,6 +4121,10 @@ def _needs_full_fact_pack(prompt: str) -> bool: return True if tokens & {"workload", "pods", "namespace", "worker", "workers"}: return True + if tokens & {"arch", "architecture", "hardware"}: + return True + if tokens & METRIC_HINT_WORDS: + return True if _NAME_INDEX and tokens & _NAME_INDEX: return True if any(phrase in q for phrase in ("where does", "where is", "where are", "running", "run on", "hosted on", "primary node")): @@ -4104,7 +4293,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s system_override=system, model=model, ) - reply = re.sub(r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", "", reply, flags=re.IGNORECASE).strip() + reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip() return _ensure_scores(reply) @@ -4405,6 +4594,8 @@ def _is_cluster_query( return True if any(word in q for word in CLUSTER_HINT_WORDS): return True + if any(word in q for word in METRIC_HINT_WORDS): + return True for host_match in HOST_RE.finditer(q): host = host_match.group(1).lower() if host.endswith("bstein.dev"):