feat: add retries for retryable provisioning failures

This commit is contained in:
Brad Stein 2026-01-24 07:12:11 -03:00
parent 632766850e
commit 03bf6f7d9b
4 changed files with 164 additions and 11 deletions

View File

@ -628,6 +628,62 @@ async def deny_access_request(
return JSONResponse({"ok": True, "request_code": row.get("request_code")}) return JSONResponse({"ok": True, "request_code": row.get("request_code")})
@app.post("/api/access/requests/{request_code}/retry")
def retry_access_request(request_code: str) -> JSONResponse:
    """Reset retry state for an access request and re-run provisioning.

    The request must exist and be in the ``accounts_building`` or
    ``approved`` status. On success the request's ``provision_attempted_at``
    is cleared, its tasks currently in ``error`` are set back to ``pending``,
    provisioning is started on a daemon thread, and an
    ``access_request_retry`` event is recorded.

    Raises:
        HTTPException: 400 when the code is blank, 503 when
            ``keycloak_admin`` is not ready, 502 when a database call fails,
            404 when no row matches, 409 when the status is not retryable.
    """
    # NOTE(review): this route declares no auth dependency, whereas
    # rotate_mailu_password uses Depends(_require_auth) -- confirm access is
    # restricted elsewhere.
    code = (request_code or "").strip()
    if code == "":
        raise HTTPException(status_code=400, detail="request_code is required")
    if not keycloak_admin.ready():
        raise HTTPException(status_code=503, detail="server not configured")

    params = (code,)
    try:
        row = portal_db.fetchone(
            "SELECT status FROM access_requests WHERE request_code = %s",
            params,
        )
    except Exception:
        raise HTTPException(status_code=502, detail="failed to load request")
    if not row:
        raise HTTPException(status_code=404, detail="not found")

    current_status = (row.get("status") or "").strip()
    if current_status != "accounts_building" and current_status != "approved":
        raise HTTPException(status_code=409, detail="request not retryable")

    clear_attempt_sql = "UPDATE access_requests SET provision_attempted_at = NULL WHERE request_code = %s"
    # The statement text is kept flush-left so the SQL sent to the database
    # is unchanged.
    requeue_tasks_sql = """
UPDATE access_request_tasks
SET status = 'pending',
detail = 'retry requested',
updated_at = NOW()
WHERE request_code = %s AND status = 'error'
"""
    try:
        portal_db.execute(clear_attempt_sql, params)
        portal_db.execute(requeue_tasks_sql, params)
    except Exception:
        raise HTTPException(status_code=502, detail="failed to update retry state")

    # Provisioning runs in the background; the response does not wait for it.
    worker = threading.Thread(
        target=provisioning.provision_access_request,
        args=params,
        daemon=True,
    )
    worker.start()

    event_payload = {"request_code": code, "status": "ok"}
    _record_event("access_request_retry", event_payload)
    return JSONResponse({"ok": True, "request_code": code})
@app.post("/api/account/mailu/rotate") @app.post("/api/account/mailu/rotate")
def rotate_mailu_password(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse: def rotate_mailu_password(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
_require_account_access(ctx) _require_account_access(ctx)

View File

@ -3,6 +3,7 @@ from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
import hashlib import hashlib
import re
import threading import threading
import time import time
from typing import Any from typing import Any
@ -31,6 +32,21 @@ WGER_PASSWORD_UPDATED_ATTR = "wger_password_updated_at"
FIREFLY_PASSWORD_ATTR = "firefly_password" FIREFLY_PASSWORD_ATTR = "firefly_password"
FIREFLY_PASSWORD_UPDATED_ATTR = "firefly_password_updated_at" FIREFLY_PASSWORD_UPDATED_ATTR = "firefly_password_updated_at"
VAULTWARDEN_GRANDFATHERED_FLAG = "vaultwarden_grandfathered" VAULTWARDEN_GRANDFATHERED_FLAG = "vaultwarden_grandfathered"
_RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
_RETRYABLE_TOKENS = (
"timeout",
"temporar",
"rate limited",
"mailbox not ready",
"connection refused",
"connection reset",
"network is unreachable",
"dns",
"name resolution",
"service unavailable",
"bad gateway",
"gateway timeout",
)
logger = get_logger(__name__) logger = get_logger(__name__)
@ -457,6 +473,39 @@ class ProvisioningManager:
self._upsert_task(conn, request_code, task, "pending", detail) self._upsert_task(conn, request_code, task, "pending", detail)
self._record_task(request_code, task, "pending", detail, started) self._record_task(request_code, task, "pending", detail, started)
def _is_retryable_detail(self, detail: str) -> bool:
if not detail:
return False
detail_lower = detail.lower()
match = re.match(r"^http\s+(\d{3})", detail_lower)
if match:
try:
code = int(match.group(1))
except ValueError:
code = 0
if code in _RETRYABLE_HTTP_CODES:
return True
return any(token in detail_lower for token in _RETRYABLE_TOKENS)
def _retryable_detail(self, detail: str) -> str:
cleaned = detail.strip() if isinstance(detail, str) else ""
if not cleaned:
return "retryable: temporary failure"
return f"retryable: {cleaned}"
def _task_fail(
    self,
    conn,
    request_code: str,
    task: str,
    detail: str,
    started: datetime,
) -> None:
    """Record a task failure as ``pending`` if retryable, else as an error.

    When ``_is_retryable_detail`` accepts the detail, the task is passed to
    ``_task_pending`` with the detail rewritten by ``_retryable_detail``.
    Otherwise the failure goes to ``_task_error`` with the detail unchanged.
    """
    if not self._is_retryable_detail(detail):
        self._task_error(conn, request_code, task, detail, started)
        return
    pending_detail = self._retryable_detail(detail)
    self._task_pending(conn, request_code, task, pending_detail, started)
def _vaultwarden_rate_limit_detail(self) -> tuple[str, datetime]: def _vaultwarden_rate_limit_detail(self) -> tuple[str, datetime]:
retry_at = datetime.now(timezone.utc) + timedelta( retry_at = datetime.now(timezone.utc) + timedelta(
seconds=float(settings.vaultwarden_admin_rate_limit_backoff_sec) seconds=float(settings.vaultwarden_admin_rate_limit_backoff_sec)
@ -643,7 +692,7 @@ class ProvisioningManager:
return True return True
except Exception as exc: except Exception as exc:
detail = safe_error_detail(exc, "failed to ensure user") detail = safe_error_detail(exc, "failed to ensure user")
self._task_error(conn, ctx.request_code, "keycloak_user", detail, start) self._task_fail(conn, ctx.request_code, "keycloak_user", detail, start)
return False return False
def _ensure_keycloak_password(self, conn, ctx: RequestContext) -> None: def _ensure_keycloak_password(self, conn, ctx: RequestContext) -> None:
@ -679,7 +728,7 @@ class ProvisioningManager:
raise RuntimeError("initial password missing") raise RuntimeError("initial password missing")
except Exception as exc: except Exception as exc:
detail = safe_error_detail(exc, "failed to set password") detail = safe_error_detail(exc, "failed to set password")
self._task_error(conn, ctx.request_code, "keycloak_password", detail, start) self._task_fail(conn, ctx.request_code, "keycloak_password", detail, start)
def _ensure_keycloak_groups(self, conn, ctx: RequestContext) -> None: def _ensure_keycloak_groups(self, conn, ctx: RequestContext) -> None:
start = datetime.now(timezone.utc) start = datetime.now(timezone.utc)
@ -694,7 +743,7 @@ class ProvisioningManager:
self._task_ok(conn, ctx.request_code, "keycloak_groups", None, start) self._task_ok(conn, ctx.request_code, "keycloak_groups", None, start)
except Exception as exc: except Exception as exc:
detail = safe_error_detail(exc, "failed to add groups") detail = safe_error_detail(exc, "failed to add groups")
self._task_error(conn, ctx.request_code, "keycloak_groups", detail, start) self._task_fail(conn, ctx.request_code, "keycloak_groups", detail, start)
def _ensure_mailu_app_password(self, conn, ctx: RequestContext) -> None: def _ensure_mailu_app_password(self, conn, ctx: RequestContext) -> None:
start = datetime.now(timezone.utc) start = datetime.now(timezone.utc)
@ -707,7 +756,7 @@ class ProvisioningManager:
self._task_ok(conn, ctx.request_code, "mailu_app_password", None, start) self._task_ok(conn, ctx.request_code, "mailu_app_password", None, start)
except Exception as exc: except Exception as exc:
detail = safe_error_detail(exc, "failed to set mail password") detail = safe_error_detail(exc, "failed to set mail password")
self._task_error(conn, ctx.request_code, "mailu_app_password", detail, start) self._task_fail(conn, ctx.request_code, "mailu_app_password", detail, start)
def _sync_mailu(self, conn, ctx: RequestContext) -> bool: def _sync_mailu(self, conn, ctx: RequestContext) -> bool:
start = datetime.now(timezone.utc) start = datetime.now(timezone.utc)
@ -727,7 +776,7 @@ class ProvisioningManager:
return True return True
except Exception as exc: except Exception as exc:
detail = safe_error_detail(exc, "failed to sync mailu") detail = safe_error_detail(exc, "failed to sync mailu")
self._task_error(conn, ctx.request_code, "mailu_sync", detail, start) self._task_fail(conn, ctx.request_code, "mailu_sync", detail, start)
return False return False
def _sync_nextcloud_mail(self, conn, ctx: RequestContext) -> None: def _sync_nextcloud_mail(self, conn, ctx: RequestContext) -> None:
@ -749,10 +798,10 @@ class ProvisioningManager:
if not detail and isinstance(result, dict): if not detail and isinstance(result, dict):
detail = str(result.get("detail") or "") detail = str(result.get("detail") or "")
detail = detail or str(status_val) detail = detail or str(status_val)
self._task_error(conn, ctx.request_code, "nextcloud_mail_sync", detail, start) self._task_fail(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
except Exception as exc: except Exception as exc:
detail = safe_error_detail(exc, "failed to sync nextcloud") detail = safe_error_detail(exc, "failed to sync nextcloud")
self._task_error(conn, ctx.request_code, "nextcloud_mail_sync", detail, start) self._task_fail(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
def _ensure_wger_account(self, conn, ctx: RequestContext) -> None: def _ensure_wger_account(self, conn, ctx: RequestContext) -> None:
start = datetime.now(timezone.utc) start = datetime.now(timezone.utc)
@ -779,7 +828,7 @@ class ProvisioningManager:
self._task_ok(conn, ctx.request_code, "wger_account", None, start) self._task_ok(conn, ctx.request_code, "wger_account", None, start)
except Exception as exc: except Exception as exc:
detail = safe_error_detail(exc, "failed to provision wger") detail = safe_error_detail(exc, "failed to provision wger")
self._task_error(conn, ctx.request_code, "wger_account", detail, start) self._task_fail(conn, ctx.request_code, "wger_account", detail, start)
def _ensure_firefly_account(self, conn, ctx: RequestContext) -> None: def _ensure_firefly_account(self, conn, ctx: RequestContext) -> None:
start = datetime.now(timezone.utc) start = datetime.now(timezone.utc)
@ -804,7 +853,7 @@ class ProvisioningManager:
self._task_ok(conn, ctx.request_code, "firefly_account", None, start) self._task_ok(conn, ctx.request_code, "firefly_account", None, start)
except Exception as exc: except Exception as exc:
detail = safe_error_detail(exc, "failed to provision firefly") detail = safe_error_detail(exc, "failed to provision firefly")
self._task_error(conn, ctx.request_code, "firefly_account", detail, start) self._task_fail(conn, ctx.request_code, "firefly_account", detail, start)
def _handle_vaultwarden_grandfathered(self, conn, ctx: RequestContext, start: datetime) -> None: def _handle_vaultwarden_grandfathered(self, conn, ctx: RequestContext, start: datetime) -> None:
lookup = vaultwarden.find_user_by_email(ctx.contact_email) lookup = vaultwarden.find_user_by_email(ctx.contact_email)
@ -827,7 +876,7 @@ class ProvisioningManager:
) )
return return
detail = lookup.detail or lookup.status detail = lookup.detail or lookup.status
self._task_error(conn, ctx.request_code, "vaultwarden_invite", detail, start) self._task_fail(conn, ctx.request_code, "vaultwarden_invite", detail, start)
def _ensure_vaultwarden_invite(self, conn, ctx: RequestContext) -> None: def _ensure_vaultwarden_invite(self, conn, ctx: RequestContext) -> None:
start = datetime.now(timezone.utc) start = datetime.now(timezone.utc)
@ -859,7 +908,7 @@ class ProvisioningManager:
self._set_vaultwarden_attrs(ctx.username, ctx.mailu_email, status) self._set_vaultwarden_attrs(ctx.username, ctx.mailu_email, status)
except Exception as exc: except Exception as exc:
detail = safe_error_detail(exc, "failed to provision vaultwarden") detail = safe_error_detail(exc, "failed to provision vaultwarden")
self._task_error(conn, ctx.request_code, "vaultwarden_invite", detail, start) self._task_fail(conn, ctx.request_code, "vaultwarden_invite", detail, start)
def _send_welcome_email(self, request_code: str, username: str, contact_email: str) -> None: def _send_welcome_email(self, request_code: str, username: str, contact_email: str) -> None:
if not settings.welcome_email_enabled: if not settings.welcome_email_enabled:

View File

@ -110,6 +110,45 @@ def test_account_access_allows_missing_groups(monkeypatch) -> None:
assert resp.status_code != 403 assert resp.status_code != 403
def test_retry_access_request_ok(monkeypatch) -> None:
    """A retryable request resets state and triggers provisioning.

    Patches ``keycloak_admin.ready``, the ``portal_db`` calls, the
    provisioning entry point and ``_record_event``, then checks the 200
    response, the code handed to provisioning, and that an executed query
    mentions ``provision_attempted_at``.
    """
    # Local import so this test does not depend on the module's import list.
    import threading

    ctx = AuthContext(username="", email="", groups=[], claims={})
    client = _client(monkeypatch, ctx)
    executed = []
    invoked = {}
    provisioned = threading.Event()

    def _fake_provision(code: str) -> None:
        invoked["code"] = code
        provisioned.set()

    monkeypatch.setattr(app_module.keycloak_admin, "ready", lambda: True)
    monkeypatch.setattr(app_module.portal_db, "fetchone", lambda *_args, **_kwargs: {"status": "accounts_building"})
    monkeypatch.setattr(app_module.portal_db, "execute", lambda query, params=None: executed.append((query, params)))
    monkeypatch.setattr(app_module.provisioning, "provision_access_request", _fake_provision)
    monkeypatch.setattr(app_module, "_record_event", lambda *args, **kwargs: None)

    resp = client.post("/api/access/requests/REQ123/retry")
    assert resp.status_code == 200
    assert resp.json()["request_code"] == "REQ123"
    # The endpoint runs provisioning on a background thread, so wait for it
    # instead of racing the assertion against the thread's start-up.
    assert provisioned.wait(timeout=5), "provisioning thread did not run"
    assert invoked["code"] == "REQ123"
    assert any("provision_attempted_at" in query for query, _params in executed)
def test_retry_access_request_not_found(monkeypatch) -> None:
    """Retrying a request code with no matching row returns HTTP 404."""
    auth = AuthContext(username="", email="", groups=[], claims={})
    http = _client(monkeypatch, auth)

    def _no_row(*_args, **_kwargs):
        return None

    monkeypatch.setattr(app_module.keycloak_admin, "ready", lambda: True)
    monkeypatch.setattr(app_module.portal_db, "fetchone", _no_row)

    response = http.post("/api/access/requests/REQ123/retry")
    assert response.status_code == 404
def test_retry_access_request_not_retryable(monkeypatch) -> None:
    """Retrying a request whose status is ``ready`` returns HTTP 409."""
    auth = AuthContext(username="", email="", groups=[], claims={})
    http = _client(monkeypatch, auth)

    def _ready_row(*_args, **_kwargs):
        return {"status": "ready"}

    monkeypatch.setattr(app_module.keycloak_admin, "ready", lambda: True)
    monkeypatch.setattr(app_module.portal_db, "fetchone", _ready_row)

    response = http.post("/api/access/requests/REQ123/retry")
    assert response.status_code == 409
def test_metrics_endpoint(monkeypatch) -> None: def test_metrics_endpoint(monkeypatch) -> None:
ctx = AuthContext(username="", email="", groups=[], claims={}) ctx = AuthContext(username="", email="", groups=[], claims={})
client = _client(monkeypatch, ctx) client = _client(monkeypatch, ctx)

View File

@ -809,6 +809,15 @@ def test_provisioning_task_helpers() -> None:
assert manager._all_tasks_ok(Conn(), "REQ", ["b"]) is False assert manager._all_tasks_ok(Conn(), "REQ", ["b"]) is False
def test_provisioning_retryable_detail_detection() -> None:
    """Check retryable classification and the ``retryable:`` prefix."""
    manager = prov.ProvisioningManager(DummyDB({}, locked=True), DummyStorage())
    expectations = {
        "timeout": True,
        "http 503: service unavailable": True,
        "mailbox not ready": True,
        "invalid credentials": False,
    }
    for detail, expected in expectations.items():
        assert manager._is_retryable_detail(detail) is expected
    assert manager._retryable_detail("timeout").startswith("retryable:")
def test_provisioning_ensure_task_rows_empty() -> None: def test_provisioning_ensure_task_rows_empty() -> None:
manager = prov.ProvisioningManager(DummyDB({}), DummyStorage()) manager = prov.ProvisioningManager(DummyDB({}), DummyStorage())
manager._ensure_task_rows(DummyConn({}, locked=True), "REQ", []) manager._ensure_task_rows(DummyConn({}, locked=True), "REQ", [])