160 lines
6.4 KiB
Python
160 lines
6.4 KiB
Python
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
import time
|
|
from typing import Any
|
|
|
|
from ..k8s.client import get_json, post_json
|
|
from ..settings import settings
|
|
from ..utils.logging import get_logger
|
|
|
|
|
|
# Module-level logger, obtained from the project logging helper under this module's name.
logger = get_logger(__name__)
|
|
|
|
_SYNC_SCRIPT = """
|
|
set -eu
|
|
token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/token)"
|
|
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
|
|
VAULT_TOKEN="$(vault write -field=token auth/kubernetes/login role="${VAULT_K8S_ROLE}" jwt="${jwt}")"
|
|
export VAULT_TOKEN
|
|
vault kv put kv/atlas/maintenance/metis-runtime k3s_token="${token}"
|
|
""".strip()
|
|
|
|
|
|
@dataclass(frozen=True)
class MetisTokenSyncResult:
    """Immutable outcome record for one metis token-sync Job.

    Inputs: the Job name and the status derived from polling the Kubernetes
    Job API.
    Outputs: a fixed two-field shape that scheduler logs/metrics can rely on
    to tell whether token sync completed, is still running, or failed.
    """

    # Name of the Kubernetes Job this result refers to.
    job: str
    # Outcome label; the service's polling produces "ok", "error", or "running".
    status: str
|
|
|
|
|
|
class MetisTokenSyncService:
    """Run metis token synchronization via one-shot Kubernetes Jobs.

    Inputs: scheduler invocations and runtime settings for namespace, role, and
    node placement.
    Outputs: per-run status that confirms whether Ariadne successfully synced
    the k3s server token into Vault.
    """

    # Seconds to pause between Job status polls.
    _POLL_INTERVAL_SEC = 2.0

    def _job_payload(self, job_name: str) -> dict[str, Any]:
        """Build the batch/v1 Job manifest for a single token-sync run.

        Inputs: `job_name`, used as the Job's metadata name.
        Outputs: a manifest dict whose namespace, TTL, service account, node,
        image, and Vault address/role are read from `settings`.
        """

        # Tolerate NoSchedule taints under both the control-plane and master
        # role keys.
        tolerations = [
            {
                "key": "node-role.kubernetes.io/control-plane",
                "operator": "Exists",
                "effect": "NoSchedule",
            },
            {
                "key": "node-role.kubernetes.io/master",
                "operator": "Exists",
                "effect": "NoSchedule",
            },
        ]

        # The container runs the sync script through /bin/sh as UID 0 with the
        # host's k3s server directory mounted read-only.
        container = {
            "name": "sync",
            "image": settings.metis_token_sync_image,
            "imagePullPolicy": "IfNotPresent",
            "command": ["/bin/sh", "-c"],
            "args": [_SYNC_SCRIPT],
            "env": [
                {"name": "VAULT_ADDR", "value": settings.metis_token_sync_vault_addr},
                {
                    "name": "VAULT_K8S_ROLE",
                    "value": settings.metis_token_sync_vault_k8s_role,
                },
            ],
            "securityContext": {"runAsUser": 0},
            "volumeMounts": [
                {
                    "name": "k3s-server",
                    "mountPath": "/host/var/lib/rancher/k3s/server",
                    "readOnly": True,
                }
            ],
        }

        pod_spec = {
            "serviceAccountName": settings.metis_token_sync_service_account,
            "restartPolicy": "OnFailure",
            # Pin the pod to the configured node; the hostPath volume below is
            # resolved on whichever node runs the pod.
            "nodeName": settings.metis_token_sync_node_name,
            "tolerations": tolerations,
            "containers": [container],
            "volumes": [
                {
                    "name": "k3s-server",
                    "hostPath": {"path": "/var/lib/rancher/k3s/server"},
                }
            ],
        }

        payload: dict[str, Any] = {
            "apiVersion": "batch/v1",
            "kind": "Job",
            "metadata": {
                "name": job_name,
                "namespace": settings.metis_token_sync_namespace,
                "labels": {
                    "app": "metis-k3s-token-sync",
                    "atlas.bstein.dev/trigger": "ariadne",
                },
            },
            "spec": {
                "backoffLimit": 1,
                "ttlSecondsAfterFinished": settings.metis_token_sync_job_ttl_sec,
                "template": {"spec": pod_spec},
            },
        }
        return payload

    def _wait_for_completion(self, job_name: str, timeout_sec: float) -> MetisTokenSyncResult:
        """Poll the Job until it reports success or failure, or time runs out.

        Inputs: `job_name` to look up and `timeout_sec` as the polling budget.
        Outputs: a result with status "ok" once `status.succeeded` is positive,
        "error" once `status.failed` is positive, or "running" when neither
        was observed by the deadline. The Job is always polled at least once,
        and the last poll happens at the deadline rather than before it.
        """

        job_path = f"/apis/batch/v1/namespaces/{settings.metis_token_sync_namespace}/jobs/{job_name}"
        # Monotonic clock: wall-clock adjustments must not shorten or stretch
        # the timeout.
        deadline = time.monotonic() + timeout_sec
        while True:
            job = get_json(job_path)
            # Treat a missing or malformed status as "nothing reported yet".
            status = job.get("status") if isinstance(job, dict) else None
            if not isinstance(status, dict):
                status = {}
            if int(status.get("succeeded") or 0) > 0:
                return MetisTokenSyncResult(job=job_name, status="ok")
            if int(status.get("failed") or 0) > 0:
                return MetisTokenSyncResult(job=job_name, status="error")

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return MetisTokenSyncResult(job=job_name, status="running")
            # Never sleep past the deadline, so the final poll sees the state
            # at the deadline instead of up to one interval earlier.
            time.sleep(min(self._POLL_INTERVAL_SEC, remaining))

    def run(self, wait: bool = True) -> dict[str, Any]:
        """Launch and optionally wait on a metis token-sync job.

        Inputs: `wait` to control synchronous verification.
        Outputs: a JSON-serializable status payload that the scheduler records in
        metrics/event history for operator visibility; status is "queued" when
        `wait` is false and "ok" otherwise.
        Raises: RuntimeError when the awaited job ends in any status other than
        "ok" (failed, or still running at the timeout).
        """

        # Second-resolution timestamp suffix distinguishes successive runs.
        job_name = f"metis-k3s-token-sync-{int(time.time())}"
        created = post_json(
            f"/apis/batch/v1/namespaces/{settings.metis_token_sync_namespace}/jobs",
            self._job_payload(job_name),
        )
        # Prefer the name echoed back by the API; fall back to the requested
        # name when the response is malformed or omits it.
        metadata = created.get("metadata") if isinstance(created, dict) else None
        reported = metadata.get("name") if isinstance(metadata, dict) else None
        name = reported or job_name
        logger.info(
            "metis token sync job triggered",
            extra={"event": "metis_token_sync_trigger", "job": name},
        )
        if not wait:
            return {"job": name, "status": "queued"}

        result = self._wait_for_completion(name, settings.metis_token_sync_wait_timeout_sec)
        if result.status != "ok":
            logger.error(
                "metis token sync job incomplete",
                extra={"event": "metis_token_sync_incomplete", "job": name, "status": result.status},
            )
            raise RuntimeError(f"metis token sync job {name} {result.status}")
        return {"job": result.job, "status": result.status}
|
|
|
|
|
|
# Module-level service instance; the class defines no per-instance state.
metis_token_sync = MetisTokenSyncService()
|