# ariadne/ariadne/services/metis_token_sync.py
#
# 160 lines
# 6.4 KiB
# Python

from __future__ import annotations
from dataclasses import dataclass
import time
from typing import Any
from ..k8s.client import get_json, post_json
from ..settings import settings
from ..utils.logging import get_logger
# Module-level logger, named after this module.
logger = get_logger(__name__)
# Shell script that the sync Job's container runs via `/bin/sh -c`.
# In order, it:
#   1. reads the k3s server token from the host directory mounted under /host,
#      deleting newline characters;
#   2. reads the pod's service-account JWT;
#   3. logs in to Vault through auth/kubernetes/login with the role given in
#      $VAULT_K8S_ROLE and exports the returned token as VAULT_TOKEN;
#   4. writes the k3s token to kv/atlas/maintenance/metis-runtime under the
#      key `k3s_token`.
# NOTE(review): this is not a raw string, so Python turns the `\n` inside the
# `tr -d '...'` argument into a real newline before the shell sees the script.
# The single quotes should then hand `tr` a literal newline, which presumably
# has the same effect as the escape -- confirm before changing the quoting.
# `.strip()` drops the leading and trailing newlines of the literal.
_SYNC_SCRIPT = """
set -eu
token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/token)"
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
VAULT_TOKEN="$(vault write -field=token auth/kubernetes/login role="${VAULT_K8S_ROLE}" jwt="${jwt}")"
export VAULT_TOKEN
vault kv put kv/atlas/maintenance/metis-runtime k3s_token="${token}"
""".strip()
@dataclass(frozen=True)
class MetisTokenSyncResult:
    """Describe how one metis token-sync Job run turned out.

    Inputs: the Job name plus the completion state read back from the
    Kubernetes Job API.
    Outputs: an immutable two-field record that scheduler logs/metrics can rely
    on to tell whether the sync finished, is still in progress, or failed.
    """

    # Name of the Kubernetes Job this result refers to.
    job: str
    # Outcome label; the waiter in this module emits "ok", "error" or "running".
    status: str
class MetisTokenSyncService:
    """Run metis token synchronization via one-shot Kubernetes Jobs.

    Inputs: scheduler invocations and runtime settings for namespace, role, and
    node placement.
    Outputs: per-run status that confirms whether Ariadne successfully synced
    the k3s server token into Vault.
    """

    def _job_payload(self, job_name: str) -> dict[str, Any]:
        """Build the batch/v1 Job manifest for a single token-sync run.

        Inputs: `job_name`, the metadata name to give the Job.
        Outputs: a manifest dict whose namespace, TTL, service account, node,
        image, and Vault address/role are all read from `settings`.
        """
        sync_container: dict[str, Any] = {
            "name": "sync",
            "image": settings.metis_token_sync_image,
            "imagePullPolicy": "IfNotPresent",
            # The whole sync is one shell script handed to `sh -c`.
            "command": ["/bin/sh", "-c"],
            "args": [_SYNC_SCRIPT],
            "env": [
                {"name": "VAULT_ADDR", "value": settings.metis_token_sync_vault_addr},
                {
                    "name": "VAULT_K8S_ROLE",
                    "value": settings.metis_token_sync_vault_k8s_role,
                },
            ],
            # Runs as root; presumably required to read the host token file.
            "securityContext": {"runAsUser": 0},
            "volumeMounts": [
                {
                    "name": "k3s-server",
                    "mountPath": "/host/var/lib/rancher/k3s/server",
                    "readOnly": True,
                }
            ],
        }
        pod_spec: dict[str, Any] = {
            "serviceAccountName": settings.metis_token_sync_service_account,
            "restartPolicy": "OnFailure",
            # Pin the pod to the configured node, whose k3s server directory
            # is exposed to the container through the hostPath volume below.
            "nodeName": settings.metis_token_sync_node_name,
            "tolerations": [
                {
                    "key": "node-role.kubernetes.io/control-plane",
                    "operator": "Exists",
                    "effect": "NoSchedule",
                },
                {
                    "key": "node-role.kubernetes.io/master",
                    "operator": "Exists",
                    "effect": "NoSchedule",
                },
            ],
            "containers": [sync_container],
            "volumes": [
                {
                    "name": "k3s-server",
                    "hostPath": {"path": "/var/lib/rancher/k3s/server"},
                }
            ],
        }
        return {
            "apiVersion": "batch/v1",
            "kind": "Job",
            "metadata": {
                "name": job_name,
                "namespace": settings.metis_token_sync_namespace,
                "labels": {
                    "app": "metis-k3s-token-sync",
                    "atlas.bstein.dev/trigger": "ariadne",
                },
            },
            "spec": {
                "backoffLimit": 1,
                "ttlSecondsAfterFinished": settings.metis_token_sync_job_ttl_sec,
                "template": {"spec": pod_spec},
            },
        }

    @staticmethod
    def _terminal_status(job: Any) -> str | None:
        """Classify a Job API object by its `status` counters.

        Inputs: `job`, the decoded Job object (anything that is not a dict, or
        whose `status` is not a dict, is treated as "no result yet").
        Outputs: "ok" when `succeeded` is positive, "error" when `failed` is
        positive, otherwise None. `succeeded` is checked first.
        """
        status = job.get("status") if isinstance(job, dict) else None
        if not isinstance(status, dict):
            return None
        # Counters may be absent or null while the Job is starting; `or 0`
        # folds both cases to zero.
        if int(status.get("succeeded") or 0) > 0:
            return "ok"
        if int(status.get("failed") or 0) > 0:
            return "error"
        return None

    def _wait_for_completion(
        self,
        job_name: str,
        timeout_sec: float,
        poll_interval_sec: float = 2.0,
    ) -> MetisTokenSyncResult:
        """Poll the Job until it succeeds, fails, or the timeout elapses.

        Inputs: `job_name` to look up, `timeout_sec` as the overall wait budget,
        and `poll_interval_sec` as the pause between polls (default 2 seconds).
        Outputs: a result with status "ok", "error", or "running" when the
        deadline passed without a terminal state.
        """
        job_url = (
            f"/apis/batch/v1/namespaces/{settings.metis_token_sync_namespace}/jobs/{job_name}"
        )
        # Monotonic clock: the deadline must not move when the wall clock is
        # adjusted while we wait.
        deadline = time.monotonic() + timeout_sec
        while True:
            # Poll before checking the deadline so the Job is always inspected
            # at least once, including one last time after the final sleep.
            outcome = self._terminal_status(get_json(job_url))
            if outcome is not None:
                return MetisTokenSyncResult(job=job_name, status=outcome)
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return MetisTokenSyncResult(job=job_name, status="running")
            # Never sleep past the deadline (or for a negative duration).
            time.sleep(max(0.0, min(poll_interval_sec, remaining)))

    def run(self, wait: bool = True) -> dict[str, Any]:
        """Launch and optionally wait on a metis token-sync job.

        Inputs: `wait` to control synchronous verification.
        Outputs: a JSON-serializable status payload that the scheduler records in
        metrics/event history for operator visibility; status is "queued" when
        `wait` is false and "ok" after a verified run.
        Raises: RuntimeError when `wait` is true and the job ends with any
        status other than "ok" (failed, or still running at the timeout).
        """
        # NOTE: the suffix has one-second resolution, so two runs started within
        # the same second would request the same Job name.
        job_name = f"metis-k3s-token-sync-{int(time.time())}"
        created = post_json(
            f"/apis/batch/v1/namespaces/{settings.metis_token_sync_namespace}/jobs",
            self._job_payload(job_name),
        )
        # Prefer the name echoed back by the API; fall back to the requested
        # one when the response has no usable metadata/name.
        metadata = created.get("metadata") if isinstance(created, dict) else None
        echoed_name = metadata.get("name") if isinstance(metadata, dict) else None
        name = echoed_name or job_name
        logger.info(
            "metis token sync job triggered",
            extra={"event": "metis_token_sync_trigger", "job": name},
        )
        if not wait:
            return {"job": name, "status": "queued"}

        result = self._wait_for_completion(name, settings.metis_token_sync_wait_timeout_sec)
        if result.status != "ok":
            logger.error(
                "metis token sync job incomplete",
                extra={
                    "event": "metis_token_sync_incomplete",
                    "job": name,
                    "status": result.status,
                },
            )
            raise RuntimeError(f"metis token sync job {name} {result.status}")
        return {"job": result.job, "status": result.status}
# Module-level instance of the service. The class defines no `__init__` and
# stores nothing on `self`, so this object carries no per-run state.
metis_token_sync = MetisTokenSyncService()