knowledge: relocate metis doc; monitoring: add cpu high alert
This commit is contained in:
parent
cac8506929
commit
c13b161171
2
.gitignore
vendored
2
.gitignore
vendored
@ -4,3 +4,5 @@
|
|||||||
!services/comms/knowledge/**/*.md
|
!services/comms/knowledge/**/*.md
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
|
.pytest_cache
|
||||||
|
.venv
|
||||||
|
|||||||
26
knowledge/software/metis.md
Normal file
26
knowledge/software/metis.md
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# Metis (node recovery)
|
||||||
|
|
||||||
|
## Node classes (current map)
|
||||||
|
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
|
||||||
|
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
|
||||||
|
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
|
||||||
|
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
|
||||||
|
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
|
||||||
|
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers.
|
||||||
|
|
||||||
|
## Longhorn disk UUIDs (critical nodes)
|
||||||
|
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
|
||||||
|
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
|
||||||
|
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
|
||||||
|
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
|
||||||
|
|
||||||
|
## Metis repo (~/Development/metis)
|
||||||
|
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
|
||||||
|
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
|
||||||
|
- `AGENTS.md` in repo is untracked and holds raw notes.
|
||||||
|
|
||||||
|
## Next implementation steps
|
||||||
|
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
|
||||||
|
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
|
||||||
|
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
|
||||||
|
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
|
||||||
@ -125,6 +125,58 @@ data:
|
|||||||
summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
|
summary: "{{ $labels.instance }} rootfs grew >1Gi in the last hour"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
- orgId: 1
|
||||||
|
name: atlas-cpu
|
||||||
|
folder: Alerts
|
||||||
|
interval: 1m
|
||||||
|
rules:
|
||||||
|
- uid: cpu-high-10m
|
||||||
|
title: "Node CPU high (>90% for 10m)"
|
||||||
|
condition: C
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
expr: 'avg_over_time(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m])'
|
||||||
|
legendFormat: '{{instance}}'
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [90]
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: NoData
|
||||||
|
execErrState: Error
|
||||||
|
annotations:
|
||||||
|
summary: "{{ $labels.instance }} CPU >90% for 10m"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: maintenance
|
name: maintenance
|
||||||
folder: Alerts
|
folder: Alerts
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user