From c5b8396bd89fb7f66a39f483643f2bf4068f0c8d Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Sat, 17 Jan 2026 02:34:36 -0300
Subject: [PATCH] comms: retry mas jobs and rerun

---
NOTE(review): this patch was recovered from a whitespace-collapsed copy.
The indentation of the embedded YAML/Python lines is a reconstruction and
must be confirmed against the target files before applying. The blob ids on
the "index" lines were not recomputed after the retry helpers were revised
(no sleep after the final attempt, local "import time", docstring).

 services/comms/mas-local-users-ensure-job.yaml | 24 +++++++++++++++++++++++-
 services/comms/othrys-kick-numeric-job.yaml    | 24 +++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml
index 8dcf8cf..d5c8471 100644
--- a/services/comms/mas-local-users-ensure-job.yaml
+++ b/services/comms/mas-local-users-ensure-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: mas-local-users-ensure-12
+  name: mas-local-users-ensure-13
   namespace: comms
 spec:
   backoffLimit: 1
@@ -109,6 +109,27 @@ spec:
               AUTH_BASE = "http://matrix-authentication-service:8080"
               SERVER_NAME = "live.bstein.dev"
 
+              def wait_for_service(url):
+                  """Wait until url answers an HTTP request.
+
+                  Any HTTP response counts as reachable; a failed request is
+                  retried after a linearly growing delay. Raises RuntimeError
+                  once all attempts have failed.
+                  """
+                  import time  # local import: this patch adds no import hunk
+
+                  attempts = 10
+                  last = None
+                  for attempt in range(1, attempts + 1):
+                      try:
+                          requests.get(url, timeout=10)
+                          return
+                      except Exception as exc:  # noqa: BLE001
+                          last = exc
+                      if attempt < attempts:  # do not sleep after the final failure
+                          time.sleep(attempt * 2)
+                  raise RuntimeError(f"MAS service not reachable: {last}") from last
+
               def admin_token():
                   with open(MAS_ADMIN_CLIENT_SECRET_FILE, "r", encoding="utf-8") as f:
                       secret = f.read().strip()
@@ -198,6 +219,7 @@ spec:
                   if r.status_code != 200:
                       raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}")
 
+              wait_for_service(MAS_ADMIN_API_BASE)
               token = admin_token()
               ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"])
               ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"])
diff --git a/services/comms/othrys-kick-numeric-job.yaml b/services/comms/othrys-kick-numeric-job.yaml
index ed25515..0d3914a 100644
--- a/services/comms/othrys-kick-numeric-job.yaml
+++ b/services/comms/othrys-kick-numeric-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: othrys-kick-numeric-7
+  name: othrys-kick-numeric-8
   namespace: comms
 spec:
   backoffLimit: 0
@@ -107,6 +107,27 @@ spec:
               def auth(token):
                   return {"Authorization": f"Bearer {token}"}
 
+              def wait_for_service(url):
+                  """Wait until url answers an HTTP request.
+
+                  Any HTTP response counts as reachable; a failed request is
+                  retried after a linearly growing delay. Exits via SystemExit
+                  once all attempts have failed.
+                  """
+                  import time  # local import: this patch adds no import hunk
+
+                  attempts = 10
+                  last = None
+                  for attempt in range(1, attempts + 1):
+                      try:
+                          requests.get(url, timeout=10)
+                          return
+                      except Exception as exc:  # noqa: BLE001
+                          last = exc
+                      if attempt < attempts:  # do not sleep after the final failure
+                          time.sleep(attempt * 2)
+                  raise SystemExit(f"MAS service not reachable: {last}")
+
               def login(user, password):
                   r = requests.post(
                       f"{AUTH_BASE}/_matrix/client/v3/login",
@@ -154,6 +175,7 @@ spec:
                   if r.status_code not in (200, 202):
                       raise SystemExit(f"kick {user_id} failed: {r.status_code} {r.text}")
 
+              wait_for_service(f"{AUTH_BASE}/_matrix/client/versions")
               token = login(SEEDER_USER, SEEDER_PASS)
               room_id = resolve_alias(token, ROOM_ALIAS)
               for user_id in list_members(token, room_id):