diff --git a/.gitignore b/.gitignore index 2fcd3f0..8e09aa9 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ !services/comms/knowledge/**/*.md __pycache__/ *.py[cod] +.pytest_cache +.venv diff --git a/knowledge/software/metis.md b/knowledge/software/metis.md new file mode 100644 index 0000000..5b0d06b --- /dev/null +++ b/knowledge/software/metis.md @@ -0,0 +1,26 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers. + +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). +- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. 
+- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index 474f490..7800d8d 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -125,6 +125,58 @@ data: summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour" labels: severity: warning + - orgId: 1 + name: atlas-cpu + folder: Alerts + interval: 1m + rules: + - uid: cpu-high-10m + title: "Node CPU high (>90% for 10m)" + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: avg_over_time(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m]) + legendFormat: '{{instance}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [90] + type: gt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "{{ $labels.instance }} CPU >90% for 10m" + labels: + severity: warning - orgId: 1 name: maintenance folder: Alerts