# Keys whose presence marks a bare JSON object as one host snapshot rather
# than a wrapper around a list of snapshots.
_SNAPSHOT_KEYS = ("hostname", "kernel", "os_image", "k3s_version", "containerd", "package_sample")
# Keys probed, in this order, for the time a snapshot was taken.
_TIMESTAMP_KEYS = ("collected_at", "timestamp", "generated_at", "created_at")


@dataclass(frozen=True)
class MetisSentinelWatchSummary:
    """Outcome of one sweep over the Metis sentinel directory.

    Attributes:
        status: ``"ok"``, ``"skipped"`` or ``"error"``.
        source: Directory that was scanned (empty when the watch was skipped).
        snapshots: Number of snapshot objects loaded.
        hosts: Number of distinct hostnames among those snapshots.
        hostnames: The distinct hostnames, sorted.
        latest_snapshot_at: ISO-8601 time of the freshest snapshot, or ``""``.
        latest_snapshot_age_sec: Age of that snapshot in seconds (never
            negative), or ``None`` when no timestamp is known.
        detail: Human-readable explanation of the status.
    """

    status: str
    source: str
    snapshots: int
    hosts: int
    hostnames: list[str] = field(default_factory=list)
    latest_snapshot_at: str = ""
    latest_snapshot_age_sec: float | None = None
    detail: str = ""


def _parse_timestamp(raw: Any) -> datetime | None:
    """Parse an ISO-8601 string; return ``None`` for anything unusable.

    The result is timezone-aware only when *raw* carries an offset (or a
    ``Z`` suffix); callers must deal with naive values themselves.
    """
    if not isinstance(raw, str):
        return None
    text = raw.strip()
    if not text:
        return None
    try:
        # ``fromisoformat`` only understands a literal "Z" from Python 3.11 on.
        return datetime.fromisoformat(text.replace("Z", "+00:00"))
    except ValueError:
        return None


def _normalize_snapshots(payload: Any) -> list[dict[str, Any]]:
    """Flatten a decoded JSON document into a list of snapshot dicts.

    Accepted shapes: a list of objects, an object wrapping such a list under
    ``snapshots``/``items``/``data``, or a single object that carries at
    least one of ``_SNAPSHOT_KEYS``. Non-dict list entries are dropped and
    every other shape yields an empty list.
    """
    if isinstance(payload, list):
        return [item for item in payload if isinstance(item, dict)]
    if not isinstance(payload, dict):
        return []
    for key in ("snapshots", "items", "data"):
        value = payload.get(key)
        if isinstance(value, list):
            return [item for item in value if isinstance(item, dict)]
    if any(key in payload for key in _SNAPSHOT_KEYS):
        return [payload]
    return []


def _snapshot_hostname(snapshot: dict[str, Any], fallback: str) -> str:
    """Return the first non-blank ``hostname``/``host``/``name`` value, stripped.

    *fallback* is returned unchanged when none of those keys holds usable text.
    """
    for key in ("hostname", "host", "name"):
        value = snapshot.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()
    return fallback


def _snapshot_timestamp(snapshot: dict[str, Any], fallback_mtime: float) -> datetime:
    """Return the snapshot's collection time as an aware UTC datetime.

    The first parsable value among ``_TIMESTAMP_KEYS`` wins; when there is
    none, *fallback_mtime* (seconds since the epoch) is used instead.
    """
    for key in _TIMESTAMP_KEYS:
        ts = _parse_timestamp(snapshot.get(key))
        if ts is None:
            continue
        if ts.tzinfo is None:
            # ``astimezone`` would read a naive value in the host's local
            # zone, making the computed age depend on where this process
            # runs. Pin it to UTC instead.
            # NOTE(review): assumes sentinels emit UTC when they omit the
            # offset - confirm with the snapshot producer.
            ts = ts.replace(tzinfo=timezone.utc)
        return ts.astimezone(timezone.utc)
    return datetime.fromtimestamp(fallback_mtime, tz=timezone.utc)


class MetisService:
    """Checks the directory in which Metis sentinels drop host snapshots."""

    def ready(self) -> bool:
        """Return ``True`` when a sentinel directory is configured."""
        return bool(settings.metis_sentinel_dir)

    def _finish(
        self,
        status: str,
        source: str,
        snapshots: list[dict[str, Any]],
        detail: str,
        latest_ts: datetime | None = None,
    ) -> MetisSentinelWatchSummary:
        """Build the summary for one sweep, log it, and return it.

        Args:
            status: Final status string for the sweep.
            source: Directory that was scanned.
            snapshots: Snapshot dicts that were loaded.
            detail: Explanation attached to the summary.
            latest_ts: Aware timestamp of the freshest snapshot, if known.
        """
        # Snapshots without a usable name get a positional placeholder, so
        # every entry contributes a non-empty hostname.
        hostnames = sorted(
            {
                _snapshot_hostname(snapshot, f"snapshot-{position}")
                for position, snapshot in enumerate(snapshots, start=1)
            }
        )

        latest_at = ""
        age_sec: float | None = None
        if latest_ts is not None:
            latest_at = latest_ts.isoformat()
            # Clamp so a timestamp slightly in the future never reports a
            # negative age.
            age_sec = max(0.0, (datetime.now(timezone.utc) - latest_ts).total_seconds())

        summary = MetisSentinelWatchSummary(
            status=status,
            source=source,
            snapshots=len(snapshots),
            hosts=len(hostnames),
            hostnames=hostnames,
            latest_snapshot_at=latest_at,
            latest_snapshot_age_sec=age_sec,
            detail=detail,
        )
        logger.info(
            "metis sentinel watch finished",
            extra={
                "event": "metis_sentinel_watch",
                "status": summary.status,
                "source": summary.source,
                "snapshots": summary.snapshots,
                "hosts": summary.hosts,
                "detail": summary.detail,
            },
        )
        return summary

    def watch_sentinel(self) -> MetisSentinelWatchSummary:
        """Scan the sentinel directory and report snapshot health.

        Returns:
            A summary whose status is ``"skipped"`` when no directory is
            configured; ``"error"`` when the path is missing or not a
            directory, holds no ``*.json`` files, any file cannot be read
            or yields no snapshots, or the freshest data is older than
            ``settings.metis_sentinel_stale_after_sec``; and ``"ok"``
            otherwise.
        """
        if not settings.metis_sentinel_dir:
            return self._finish("skipped", "", [], "metis sentinel dir not configured")

        source = Path(settings.metis_sentinel_dir)
        if not source.exists():
            return self._finish("error", str(source), [], "metis sentinel dir does not exist")
        if not source.is_dir():
            return self._finish("error", str(source), [], "metis sentinel path is not a directory")

        files = sorted(path for path in source.rglob("*.json") if path.is_file())
        if not files:
            return self._finish("error", str(source), [], "metis sentinel dir does not contain snapshots")

        snapshots: list[dict[str, Any]] = []
        latest_ts: datetime | None = None
        detail_parts: list[str] = []
        newest_mtime = 0.0

        for file_path in files:
            try:
                # Decode explicitly as UTF-8: the default would follow the
                # process locale, which can reject or mangle valid JSON.
                payload = json.loads(file_path.read_text(encoding="utf-8"))
            except Exception as exc:  # noqa: BLE001
                # Deliberately broad: one unreadable file must not abort the
                # sweep; it is reported through the detail text instead.
                detail_parts.append(f"{file_path.name}: {exc}")
                continue
            normalized = _normalize_snapshots(payload)
            if not normalized:
                detail_parts.append(f"{file_path.name}: empty snapshot payload")
                continue
            snapshots.extend(normalized)
            try:
                mtime = file_path.stat().st_mtime
            except OSError:
                # The file was readable a moment ago; fall back to "now".
                mtime = time.time()
            newest_mtime = max(newest_mtime, mtime)
            for snapshot in normalized:
                ts = _snapshot_timestamp(snapshot, mtime)
                if latest_ts is None or ts > latest_ts:
                    latest_ts = ts

        if not snapshots:
            detail = "; ".join(detail_parts) if detail_parts else "metis sentinel snapshots were empty"
            return self._finish("error", str(source), [], detail)

        # A file modified after its embedded timestamp still counts as fresh.
        if newest_mtime > 0.0:
            latest_file_ts = datetime.fromtimestamp(newest_mtime, tz=timezone.utc)
            if latest_ts is None or latest_file_ts > latest_ts:
                latest_ts = latest_file_ts

        detail = f"loaded {len(files)} file(s) and {len(snapshots)} snapshot(s)"
        if detail_parts:
            detail = f"{detail}; {'; '.join(detail_parts)}"

        status = "ok"
        stale_after = settings.metis_sentinel_stale_after_sec
        # A non-positive limit disables the staleness check.
        if stale_after > 0 and latest_ts is not None:
            age_sec = max(0.0, (datetime.now(timezone.utc) - latest_ts).total_seconds())
            if age_sec > stale_after:
                status = "error"
                detail = (
                    f"latest sentinel snapshot is stale by {round(age_sec, 1)}s "
                    f"(limit {stale_after:.0f}s)"
                )
                if detail_parts:
                    detail = f"{detail}; {'; '.join(detail_parts)}"

        # Any file that could not be used degrades the sweep even when the
        # remaining snapshots are fresh.
        if detail_parts:
            status = "error"

        return self._finish(status, str(source), snapshots, detail, latest_ts=latest_ts)


# Shared instance; ariadne/app.py imports it to register the scheduled watch.
metis = MetisService()
# --- tests/test_app.py ---


def test_startup_registers_metis_watch(monkeypatch) -> None:
    """``_startup`` registers a task named ``schedule.metis_sentinel_watch``."""
    registered: list[tuple[str, str]] = []

    # Neutralise everything ``_startup`` and later teardown would really start
    # or close.
    no_op_targets = (
        (app_module.provisioning, "start"),
        (app_module.scheduler, "start"),
        (app_module.scheduler, "stop"),
        (app_module.provisioning, "stop"),
        (app_module.portal_db, "close"),
        (app_module.ariadne_db, "close"),
    )
    for owner, attribute in no_op_targets:
        monkeypatch.setattr(owner, attribute, lambda: None)

    def record_task(name, cron_expr, runner):
        registered.append((name, cron_expr))

    monkeypatch.setattr(app_module.scheduler, "add_task", record_task)

    app_module._startup()

    task_names = [name for name, _cron in registered]
    assert "schedule.metis_sentinel_watch" in task_names


# --- tests/test_metis.py ---


def test_watch_sentinel_reads_snapshot_dir(monkeypatch, tmp_path) -> None:
    """Two single-snapshot files are loaded and reported as two hosts."""
    fake_settings = types.SimpleNamespace(
        metis_sentinel_dir=str(tmp_path),
        metis_sentinel_stale_after_sec=3600.0,
    )
    monkeypatch.setattr("ariadne.services.metis.settings", fake_settings)

    for file_name, hostname in (("node-a.json", "titan-13"), ("node-b.json", "titan-19")):
        snapshot = {
            "hostname": hostname,
            "kernel": "6.6.63",
            "containerd": "1.7.23",
        }
        (tmp_path / file_name).write_text(json.dumps(snapshot), encoding="utf-8")

    summary = MetisService().watch_sentinel()

    assert summary.status == "ok"
    assert (summary.snapshots, summary.hosts) == (2, 2)
    assert summary.hostnames == ["titan-13", "titan-19"]
    assert summary.source == str(tmp_path)


def test_watch_sentinel_skips_when_unconfigured(monkeypatch) -> None:
    """An empty sentinel dir setting yields a ``skipped`` summary with no data."""
    fake_settings = types.SimpleNamespace(
        metis_sentinel_dir="",
        metis_sentinel_stale_after_sec=3600.0,
    )
    monkeypatch.setattr("ariadne.services.metis.settings", fake_settings)

    summary = MetisService().watch_sentinel()

    assert summary.status == "skipped"
    assert (summary.snapshots, summary.hosts) == (0, 0)
def test_env_float_invalid(monkeypatch) -> None:
    """An unparsable float value makes ``_env_float`` return the default."""
    default = 1.5
    monkeypatch.setenv("ARIADNE_FLOAT_TEST", "bad")
    assert settings_module._env_float("ARIADNE_FLOAT_TEST", default) == default


def test_from_env_includes_metis_settings(monkeypatch) -> None:
    """``Settings.from_env`` picks up the three Metis environment variables."""
    overrides = {
        "METIS_SENTINEL_DIR": "/var/lib/metis/sentinel",
        "METIS_SENTINEL_STALE_AFTER_SEC": "900",
        "ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH": "*/7 * * * *",
    }
    for variable, value in overrides.items():
        monkeypatch.setenv(variable, value)

    cfg = Settings.from_env()

    observed = (
        cfg.metis_sentinel_dir,
        cfg.metis_sentinel_stale_after_sec,
        cfg.metis_sentinel_watch_cron,
    )
    assert observed == ("/var/lib/metis/sentinel", 900.0, "*/7 * * * *")