metis: add sentinel watch task

This commit is contained in:
Brad Stein 2026-03-31 14:07:02 -03:00
parent 44336f1272
commit cf0271a8ea
7 changed files with 310 additions and 3 deletions

View File

@ -25,6 +25,7 @@ from .services.mailu import mailu
from .services.mailu_events import mailu_events
from .services.nextcloud import nextcloud
from .services.image_sweeper import image_sweeper
from .services.metis import metis
from .services.opensearch_prune import prune_indices
from .services.pod_cleaner import clean_finished_pods
from .services.vaultwarden_sync import run_vaultwarden_sync
@ -309,6 +310,11 @@ def _startup() -> None:
settings.image_sweeper_cron,
lambda: image_sweeper.run(wait=True),
)
scheduler.add_task(
"schedule.metis_sentinel_watch",
settings.metis_sentinel_watch_cron,
lambda: metis.watch_sentinel(),
)
scheduler.add_task(
"schedule.vault_k8s_auth",
settings.vault_k8s_auth_cron,
@ -361,6 +367,7 @@ def _startup() -> None:
"pod_cleaner_cron": settings.pod_cleaner_cron,
"opensearch_prune_cron": settings.opensearch_prune_cron,
"image_sweeper_cron": settings.image_sweeper_cron,
"metis_sentinel_watch_cron": settings.metis_sentinel_watch_cron,
"vault_k8s_auth_cron": settings.vault_k8s_auth_cron,
"vault_oidc_cron": settings.vault_oidc_cron,
"comms_guest_name_cron": settings.comms_guest_name_cron,

189
ariadne/services/metis.py Normal file
View File

@ -0,0 +1,189 @@
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime, timezone
import json
import time
from pathlib import Path
from typing import Any
from ..settings import settings
from ..utils.logging import get_logger
# Module-scoped logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Keys whose presence marks a bare JSON object as a single snapshot
# (checked by _normalize_snapshots when no list wrapper is found).
_SNAPSHOT_KEYS = ("hostname", "kernel", "os_image", "k3s_version", "containerd", "package_sample")
# Snapshot fields probed, in this order, for an embedded timestamp
# (see _snapshot_timestamp); the first one that parses wins.
_TIMESTAMP_KEYS = ("collected_at", "timestamp", "generated_at", "created_at")
@dataclass(frozen=True)
class MetisSentinelWatchSummary:
status: str
source: str
snapshots: int
hosts: int
hostnames: list[str] = field(default_factory=list)
latest_snapshot_at: str = ""
latest_snapshot_age_sec: float | None = None
detail: str = ""
def _parse_timestamp(raw: Any) -> datetime | None:
if not isinstance(raw, str):
return None
text = raw.strip()
if not text:
return None
try:
return datetime.fromisoformat(text.replace("Z", "+00:00"))
except ValueError:
return None
def _normalize_snapshots(payload: Any) -> list[dict[str, Any]]:
if isinstance(payload, list):
return [item for item in payload if isinstance(item, dict)]
if not isinstance(payload, dict):
return []
for key in ("snapshots", "items", "data"):
value = payload.get(key)
if isinstance(value, list):
return [item for item in value if isinstance(item, dict)]
if any(key in payload for key in _SNAPSHOT_KEYS):
return [payload]
return []
def _snapshot_hostname(snapshot: dict[str, Any], fallback: str) -> str:
for key in ("hostname", "host", "name"):
value = snapshot.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
return fallback
def _snapshot_timestamp(snapshot: dict[str, Any], fallback_mtime: float) -> datetime:
    """Return the snapshot's timestamp expressed in UTC.

    The known timestamp fields are tried in their declared order and the
    first one that parses is converted to UTC. When none parses, the
    supplied file modification time (epoch seconds) is used instead.
    """
    # Lazy on purpose: parsing stops at the first field that yields a value.
    parsed_candidates = (_parse_timestamp(snapshot.get(name)) for name in _TIMESTAMP_KEYS)
    embedded = next((stamp for stamp in parsed_candidates if stamp is not None), None)
    if embedded is None:
        return datetime.fromtimestamp(fallback_mtime, tz=timezone.utc)
    return embedded.astimezone(timezone.utc)
class MetisService:
    """Inspect the configured Metis sentinel directory and summarise its snapshots."""

    def ready(self) -> bool:
        """Return True when a sentinel directory is configured."""
        return bool(settings.metis_sentinel_dir)

    def _finish(
        self,
        status: str,
        source: str,
        snapshots: list[dict[str, Any]],
        detail: str,
        latest_ts: datetime | None = None,
    ) -> MetisSentinelWatchSummary:
        """Build the run summary, log it, and return it.

        Args:
            status: Outcome label for the run.
            source: Inspected directory as a string ("" when unconfigured).
            snapshots: Snapshot dicts loaded during the run.
            detail: Human-readable explanation of the outcome.
            latest_ts: Newest timestamp observed, if any.

        Returns:
            The populated ``MetisSentinelWatchSummary``.
        """
        # Snapshots without a usable host field get a positional placeholder
        # ("snapshot-1", "snapshot-2", ...); duplicates collapse via the set.
        hostnames = sorted(
            {
                _snapshot_hostname(snapshot, f"snapshot-{position}")
                for position, snapshot in enumerate(snapshots, start=1)
            }
        )

        latest_at = ""
        latest_age_sec: float | None = None
        if latest_ts is not None:
            latest_at = latest_ts.isoformat()
            # Clamp at zero so a timestamp slightly in the future is not negative.
            latest_age_sec = max(0.0, (datetime.now(timezone.utc) - latest_ts).total_seconds())

        summary = MetisSentinelWatchSummary(
            status=status,
            source=source,
            snapshots=len(snapshots),
            hosts=len(hostnames),
            hostnames=hostnames,
            latest_snapshot_at=latest_at,
            latest_snapshot_age_sec=latest_age_sec,
            detail=detail,
        )
        logger.info(
            "metis sentinel watch finished",
            extra={
                "event": "metis_sentinel_watch",
                "status": summary.status,
                "source": summary.source,
                "snapshots": summary.snapshots,
                "hosts": summary.hosts,
                "detail": summary.detail,
            },
        )
        return summary

    def watch_sentinel(self) -> MetisSentinelWatchSummary:
        """Scan the sentinel directory and report on the snapshots it contains.

        Every ``*.json`` file below the directory (recursively) is decoded as
        UTF-8 JSON and normalised into snapshot dicts.

        Returns:
            A summary whose status is:
              * "skipped" when no directory is configured;
              * "error" when the path is missing or not a directory, holds no
                JSON files, yields no snapshots, any file failed to load, or
                the newest timestamp is older than
                ``settings.metis_sentinel_stale_after_sec`` (checked only when
                that limit is positive);
              * "ok" otherwise.
        """
        if not settings.metis_sentinel_dir:
            return self._finish("skipped", "", [], "metis sentinel dir not configured")

        source = Path(settings.metis_sentinel_dir)
        if not source.exists():
            return self._finish("error", str(source), [], "metis sentinel dir does not exist")
        if not source.is_dir():
            return self._finish("error", str(source), [], "metis sentinel path is not a directory")

        files = sorted(path for path in source.rglob("*.json") if path.is_file())
        if not files:
            return self._finish("error", str(source), [], "metis sentinel dir does not contain snapshots")

        snapshots: list[dict[str, Any]] = []
        latest_ts: datetime | None = None
        detail_parts: list[str] = []
        newest_mtime = 0.0

        for file_path in files:
            try:
                # JSON interchange is UTF-8; do not depend on the locale default.
                payload = json.loads(file_path.read_text(encoding="utf-8"))
            except Exception as exc:  # noqa: BLE001
                # Best effort: one unreadable file must not abort the scan, but
                # it is recorded and later turns the run into an error.
                detail_parts.append(f"{file_path.name}: {exc}")
                continue

            normalized = _normalize_snapshots(payload)
            if not normalized:
                detail_parts.append(f"{file_path.name}: empty snapshot payload")
                continue
            snapshots.extend(normalized)

            try:
                mtime = file_path.stat().st_mtime
            except OSError:
                # File vanished or became unreadable between read and stat.
                mtime = time.time()
            newest_mtime = max(newest_mtime, mtime)

            for snapshot in normalized:
                ts = _snapshot_timestamp(snapshot, mtime)
                if latest_ts is None or ts > latest_ts:
                    latest_ts = ts

        if not snapshots:
            detail = "; ".join(detail_parts) if detail_parts else "metis sentinel snapshots were empty"
            return self._finish("error", str(source), [], detail)

        # A recently rewritten file counts as fresh even when the timestamps
        # embedded in its snapshots are older.
        if newest_mtime > 0.0:
            latest_file_ts = datetime.fromtimestamp(newest_mtime, tz=timezone.utc)
            if latest_ts is None or latest_file_ts > latest_ts:
                latest_ts = latest_file_ts

        detail = f"loaded {len(files)} file(s) and {len(snapshots)} snapshot(s)"
        if detail_parts:
            detail = f"{detail}; {'; '.join(detail_parts)}"

        status = "ok"
        if settings.metis_sentinel_stale_after_sec > 0 and latest_ts is not None:
            age_sec = max(0.0, (datetime.now(timezone.utc) - latest_ts).total_seconds())
            if age_sec > settings.metis_sentinel_stale_after_sec:
                status = "error"
                # The staleness message replaces the "loaded ..." detail.
                detail = (
                    f"latest sentinel snapshot is stale by {round(age_sec, 1)}s "
                    f"(limit {settings.metis_sentinel_stale_after_sec:.0f}s)"
                )
                if detail_parts:
                    detail = f"{detail}; {'; '.join(detail_parts)}"

        # Any per-file problem downgrades an otherwise healthy run.
        if detail_parts and status == "ok":
            status = "error"
        return self._finish(status, str(source), snapshots, detail, latest_ts=latest_ts)
# Module-level service instance; the application's startup code imports this
# name and schedules metis.watch_sentinel() on a cron expression.
metis = MetisService()

View File

@ -2,6 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass
import os
from typing import Any
def _env(name: str, default: str = "") -> str:
@ -212,6 +213,9 @@ class Settings:
keycloak_profile_cron: str
cluster_state_cron: str
cluster_state_keep: int
metis_sentinel_dir: str
metis_sentinel_stale_after_sec: float
metis_sentinel_watch_cron: str
opensearch_url: str
opensearch_limit_bytes: int
@ -475,6 +479,14 @@ class Settings:
"cluster_state_keep": _env_int("ARIADNE_CLUSTER_STATE_KEEP", 168),
}
@classmethod
def _metis_config(cls) -> dict[str, Any]:
    """Collect the Metis sentinel settings from the environment.

    Returns a mapping with the sentinel directory (default ""), the
    staleness limit in seconds (default 3600.0) and the cron expression
    for the watch task (default "*/15 * * * *").
    """
    sentinel_dir = _env("METIS_SENTINEL_DIR", "")
    stale_after_sec = _env_float("METIS_SENTINEL_STALE_AFTER_SEC", 3600.0)
    watch_cron = _env("ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH", "*/15 * * * *")
    config: dict[str, Any] = {
        "metis_sentinel_dir": sentinel_dir,
        "metis_sentinel_stale_after_sec": stale_after_sec,
        "metis_sentinel_watch_cron": watch_cron,
    }
    return config
@classmethod
def _opensearch_config(cls) -> dict[str, Any]:
return {
@ -502,6 +514,7 @@ class Settings:
vaultwarden_cfg = cls._vaultwarden_config()
schedule_cfg = cls._schedule_config()
cluster_cfg = cls._cluster_state_config()
metis_cfg = cls._metis_config()
opensearch_cfg = cls._opensearch_config()
portal_db = _env("PORTAL_DATABASE_URL", "")
@ -540,6 +553,7 @@ class Settings:
**vaultwarden_cfg,
**schedule_cfg,
**cluster_cfg,
**metis_cfg,
**opensearch_cfg,
)

6
tests/conftest.py Normal file
View File

@ -0,0 +1,6 @@
from __future__ import annotations
import os
# Placeholder database URL for the test session, set when conftest is imported.
# NOTE(review): presumably ariadne reads this while its modules are imported,
# so it must be in place before any test module imports the package — confirm.
# Plain assignment (not setdefault) overrides a value already present in the
# environment; verify that is intended.
os.environ["PORTAL_DATABASE_URL"] = "postgresql://user:pass@localhost/db"

View File

@ -2,13 +2,10 @@ from __future__ import annotations
import dataclasses
from datetime import datetime, timezone
import os
from fastapi import HTTPException
from fastapi.testclient import TestClient
os.environ.setdefault("PORTAL_DATABASE_URL", "postgresql://user:pass@localhost/db")
from ariadne.auth.keycloak import AuthContext
import ariadne.app as app_module
@ -47,6 +44,26 @@ def test_startup_and_shutdown(monkeypatch) -> None:
app_module._shutdown()
def test_startup_registers_metis_watch(monkeypatch) -> None:
    """Startup must register the sentinel watch task with the scheduler."""
    registered: list[tuple[str, str]] = []

    def _record(name, cron_expr, runner) -> None:
        # Capture what was registered; the runner itself is never invoked.
        registered.append((name, cron_expr))

    # Neutralise everything that would start threads or touch databases.
    stubbed = (
        (app_module.provisioning, "start"),
        (app_module.scheduler, "start"),
        (app_module.scheduler, "stop"),
        (app_module.provisioning, "stop"),
        (app_module.portal_db, "close"),
        (app_module.ariadne_db, "close"),
    )
    for owner, attribute in stubbed:
        monkeypatch.setattr(owner, attribute, lambda: None)
    monkeypatch.setattr(app_module.scheduler, "add_task", _record)

    app_module._startup()

    task_names = [name for name, _cron in registered]
    assert "schedule.metis_sentinel_watch" in task_names
def test_record_event_handles_exception(monkeypatch) -> None:
monkeypatch.setattr(app_module.storage, "record_event", lambda *args, **kwargs: (_ for _ in ()).throw(RuntimeError("fail")))
app_module._record_event("event", {"ok": True})

62
tests/test_metis.py Normal file
View File

@ -0,0 +1,62 @@
from __future__ import annotations
import json
from pathlib import Path
import types
from ariadne.services.metis import MetisService
def test_watch_sentinel_reads_snapshot_dir(monkeypatch, tmp_path) -> None:
    """Two single-host snapshot files produce an "ok" summary listing both hosts."""
    fake_settings = types.SimpleNamespace(
        metis_sentinel_dir=str(tmp_path),
        metis_sentinel_stale_after_sec=3600.0,
    )
    monkeypatch.setattr("ariadne.services.metis.settings", fake_settings)

    # One snapshot file per node; only the hostname differs.
    for filename, hostname in (("node-a.json", "titan-13"), ("node-b.json", "titan-19")):
        body = {
            "hostname": hostname,
            "kernel": "6.6.63",
            "containerd": "1.7.23",
        }
        Path(tmp_path, filename).write_text(json.dumps(body), encoding="utf-8")

    summary = MetisService().watch_sentinel()

    assert summary.status == "ok"
    assert (summary.snapshots, summary.hosts) == (2, 2)
    assert summary.hostnames == ["titan-13", "titan-19"]
    assert summary.source == str(tmp_path)
def test_watch_sentinel_skips_when_unconfigured(monkeypatch) -> None:
    """With no sentinel directory configured the watch reports "skipped" and loads nothing."""
    unconfigured = types.SimpleNamespace(
        metis_sentinel_dir="",
        metis_sentinel_stale_after_sec=3600.0,
    )
    monkeypatch.setattr("ariadne.services.metis.settings", unconfigured)

    result = MetisService().watch_sentinel()

    assert result.status == "skipped"
    assert (result.snapshots, result.hosts) == (0, 0)

View File

@ -1,6 +1,7 @@
from __future__ import annotations
from ariadne import settings as settings_module
from ariadne.settings import Settings
def test_env_int_invalid(monkeypatch) -> None:
@ -11,3 +12,14 @@ def test_env_int_invalid(monkeypatch) -> None:
def test_env_float_invalid(monkeypatch) -> None:
    """An unparseable float in the environment yields the supplied default."""
    fallback = 1.5
    monkeypatch.setenv("ARIADNE_FLOAT_TEST", "bad")
    parsed = settings_module._env_float("ARIADNE_FLOAT_TEST", fallback)
    assert parsed == fallback
def test_from_env_includes_metis_settings(monkeypatch) -> None:
    """Settings.from_env picks up all three Metis sentinel variables."""
    overrides = {
        "METIS_SENTINEL_DIR": "/var/lib/metis/sentinel",
        "METIS_SENTINEL_STALE_AFTER_SEC": "900",
        "ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH": "*/7 * * * *",
    }
    for variable, value in overrides.items():
        monkeypatch.setenv(variable, value)

    loaded = Settings.from_env()

    assert loaded.metis_sentinel_dir == "/var/lib/metis/sentinel"
    assert loaded.metis_sentinel_stale_after_sec == 900.0
    assert loaded.metis_sentinel_watch_cron == "*/7 * * * *"