feat: add retryable provisioning retries
This commit is contained in:
parent
632766850e
commit
03bf6f7d9b
@ -628,6 +628,62 @@ async def deny_access_request(
|
|||||||
return JSONResponse({"ok": True, "request_code": row.get("request_code")})
|
return JSONResponse({"ok": True, "request_code": row.get("request_code")})
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/access/requests/{request_code}/retry")
|
||||||
|
def retry_access_request(request_code: str) -> JSONResponse:
|
||||||
|
code = (request_code or "").strip()
|
||||||
|
if not code:
|
||||||
|
raise HTTPException(status_code=400, detail="request_code is required")
|
||||||
|
if not keycloak_admin.ready():
|
||||||
|
raise HTTPException(status_code=503, detail="server not configured")
|
||||||
|
|
||||||
|
try:
|
||||||
|
row = portal_db.fetchone(
|
||||||
|
"SELECT status FROM access_requests WHERE request_code = %s",
|
||||||
|
(code,),
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
raise HTTPException(status_code=502, detail="failed to load request")
|
||||||
|
|
||||||
|
if not row:
|
||||||
|
raise HTTPException(status_code=404, detail="not found")
|
||||||
|
|
||||||
|
status = (row.get("status") or "").strip()
|
||||||
|
if status not in {"accounts_building", "approved"}:
|
||||||
|
raise HTTPException(status_code=409, detail="request not retryable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
portal_db.execute(
|
||||||
|
"UPDATE access_requests SET provision_attempted_at = NULL WHERE request_code = %s",
|
||||||
|
(code,),
|
||||||
|
)
|
||||||
|
portal_db.execute(
|
||||||
|
"""
|
||||||
|
UPDATE access_request_tasks
|
||||||
|
SET status = 'pending',
|
||||||
|
detail = 'retry requested',
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE request_code = %s AND status = 'error'
|
||||||
|
""",
|
||||||
|
(code,),
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
raise HTTPException(status_code=502, detail="failed to update retry state")
|
||||||
|
|
||||||
|
threading.Thread(
|
||||||
|
target=provisioning.provision_access_request,
|
||||||
|
args=(code,),
|
||||||
|
daemon=True,
|
||||||
|
).start()
|
||||||
|
_record_event(
|
||||||
|
"access_request_retry",
|
||||||
|
{
|
||||||
|
"request_code": code,
|
||||||
|
"status": "ok",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return JSONResponse({"ok": True, "request_code": code})
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/account/mailu/rotate")
|
@app.post("/api/account/mailu/rotate")
|
||||||
def rotate_mailu_password(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
|
def rotate_mailu_password(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
|
||||||
_require_account_access(ctx)
|
_require_account_access(ctx)
|
||||||
|
|||||||
@ -3,6 +3,7 @@ from __future__ import annotations
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime, timedelta, timezone
|
from datetime import datetime, timedelta, timezone
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import re
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@ -31,6 +32,21 @@ WGER_PASSWORD_UPDATED_ATTR = "wger_password_updated_at"
|
|||||||
FIREFLY_PASSWORD_ATTR = "firefly_password"
|
FIREFLY_PASSWORD_ATTR = "firefly_password"
|
||||||
FIREFLY_PASSWORD_UPDATED_ATTR = "firefly_password_updated_at"
|
FIREFLY_PASSWORD_UPDATED_ATTR = "firefly_password_updated_at"
|
||||||
VAULTWARDEN_GRANDFATHERED_FLAG = "vaultwarden_grandfathered"
|
VAULTWARDEN_GRANDFATHERED_FLAG = "vaultwarden_grandfathered"
|
||||||
|
_RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
|
||||||
|
_RETRYABLE_TOKENS = (
|
||||||
|
"timeout",
|
||||||
|
"temporar",
|
||||||
|
"rate limited",
|
||||||
|
"mailbox not ready",
|
||||||
|
"connection refused",
|
||||||
|
"connection reset",
|
||||||
|
"network is unreachable",
|
||||||
|
"dns",
|
||||||
|
"name resolution",
|
||||||
|
"service unavailable",
|
||||||
|
"bad gateway",
|
||||||
|
"gateway timeout",
|
||||||
|
)
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
@ -457,6 +473,39 @@ class ProvisioningManager:
|
|||||||
self._upsert_task(conn, request_code, task, "pending", detail)
|
self._upsert_task(conn, request_code, task, "pending", detail)
|
||||||
self._record_task(request_code, task, "pending", detail, started)
|
self._record_task(request_code, task, "pending", detail, started)
|
||||||
|
|
||||||
|
def _is_retryable_detail(self, detail: str) -> bool:
|
||||||
|
if not detail:
|
||||||
|
return False
|
||||||
|
detail_lower = detail.lower()
|
||||||
|
match = re.match(r"^http\s+(\d{3})", detail_lower)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
code = int(match.group(1))
|
||||||
|
except ValueError:
|
||||||
|
code = 0
|
||||||
|
if code in _RETRYABLE_HTTP_CODES:
|
||||||
|
return True
|
||||||
|
return any(token in detail_lower for token in _RETRYABLE_TOKENS)
|
||||||
|
|
||||||
|
def _retryable_detail(self, detail: str) -> str:
|
||||||
|
cleaned = detail.strip() if isinstance(detail, str) else ""
|
||||||
|
if not cleaned:
|
||||||
|
return "retryable: temporary failure"
|
||||||
|
return f"retryable: {cleaned}"
|
||||||
|
|
||||||
|
def _task_fail(
|
||||||
|
self,
|
||||||
|
conn,
|
||||||
|
request_code: str,
|
||||||
|
task: str,
|
||||||
|
detail: str,
|
||||||
|
started: datetime,
|
||||||
|
) -> None:
|
||||||
|
if self._is_retryable_detail(detail):
|
||||||
|
self._task_pending(conn, request_code, task, self._retryable_detail(detail), started)
|
||||||
|
return
|
||||||
|
self._task_error(conn, request_code, task, detail, started)
|
||||||
|
|
||||||
def _vaultwarden_rate_limit_detail(self) -> tuple[str, datetime]:
|
def _vaultwarden_rate_limit_detail(self) -> tuple[str, datetime]:
|
||||||
retry_at = datetime.now(timezone.utc) + timedelta(
|
retry_at = datetime.now(timezone.utc) + timedelta(
|
||||||
seconds=float(settings.vaultwarden_admin_rate_limit_backoff_sec)
|
seconds=float(settings.vaultwarden_admin_rate_limit_backoff_sec)
|
||||||
@ -643,7 +692,7 @@ class ProvisioningManager:
|
|||||||
return True
|
return True
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
detail = safe_error_detail(exc, "failed to ensure user")
|
detail = safe_error_detail(exc, "failed to ensure user")
|
||||||
self._task_error(conn, ctx.request_code, "keycloak_user", detail, start)
|
self._task_fail(conn, ctx.request_code, "keycloak_user", detail, start)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _ensure_keycloak_password(self, conn, ctx: RequestContext) -> None:
|
def _ensure_keycloak_password(self, conn, ctx: RequestContext) -> None:
|
||||||
@ -679,7 +728,7 @@ class ProvisioningManager:
|
|||||||
raise RuntimeError("initial password missing")
|
raise RuntimeError("initial password missing")
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
detail = safe_error_detail(exc, "failed to set password")
|
detail = safe_error_detail(exc, "failed to set password")
|
||||||
self._task_error(conn, ctx.request_code, "keycloak_password", detail, start)
|
self._task_fail(conn, ctx.request_code, "keycloak_password", detail, start)
|
||||||
|
|
||||||
def _ensure_keycloak_groups(self, conn, ctx: RequestContext) -> None:
|
def _ensure_keycloak_groups(self, conn, ctx: RequestContext) -> None:
|
||||||
start = datetime.now(timezone.utc)
|
start = datetime.now(timezone.utc)
|
||||||
@ -694,7 +743,7 @@ class ProvisioningManager:
|
|||||||
self._task_ok(conn, ctx.request_code, "keycloak_groups", None, start)
|
self._task_ok(conn, ctx.request_code, "keycloak_groups", None, start)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
detail = safe_error_detail(exc, "failed to add groups")
|
detail = safe_error_detail(exc, "failed to add groups")
|
||||||
self._task_error(conn, ctx.request_code, "keycloak_groups", detail, start)
|
self._task_fail(conn, ctx.request_code, "keycloak_groups", detail, start)
|
||||||
|
|
||||||
def _ensure_mailu_app_password(self, conn, ctx: RequestContext) -> None:
|
def _ensure_mailu_app_password(self, conn, ctx: RequestContext) -> None:
|
||||||
start = datetime.now(timezone.utc)
|
start = datetime.now(timezone.utc)
|
||||||
@ -707,7 +756,7 @@ class ProvisioningManager:
|
|||||||
self._task_ok(conn, ctx.request_code, "mailu_app_password", None, start)
|
self._task_ok(conn, ctx.request_code, "mailu_app_password", None, start)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
detail = safe_error_detail(exc, "failed to set mail password")
|
detail = safe_error_detail(exc, "failed to set mail password")
|
||||||
self._task_error(conn, ctx.request_code, "mailu_app_password", detail, start)
|
self._task_fail(conn, ctx.request_code, "mailu_app_password", detail, start)
|
||||||
|
|
||||||
def _sync_mailu(self, conn, ctx: RequestContext) -> bool:
|
def _sync_mailu(self, conn, ctx: RequestContext) -> bool:
|
||||||
start = datetime.now(timezone.utc)
|
start = datetime.now(timezone.utc)
|
||||||
@ -727,7 +776,7 @@ class ProvisioningManager:
|
|||||||
return True
|
return True
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
detail = safe_error_detail(exc, "failed to sync mailu")
|
detail = safe_error_detail(exc, "failed to sync mailu")
|
||||||
self._task_error(conn, ctx.request_code, "mailu_sync", detail, start)
|
self._task_fail(conn, ctx.request_code, "mailu_sync", detail, start)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _sync_nextcloud_mail(self, conn, ctx: RequestContext) -> None:
|
def _sync_nextcloud_mail(self, conn, ctx: RequestContext) -> None:
|
||||||
@ -749,10 +798,10 @@ class ProvisioningManager:
|
|||||||
if not detail and isinstance(result, dict):
|
if not detail and isinstance(result, dict):
|
||||||
detail = str(result.get("detail") or "")
|
detail = str(result.get("detail") or "")
|
||||||
detail = detail or str(status_val)
|
detail = detail or str(status_val)
|
||||||
self._task_error(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
|
self._task_fail(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
detail = safe_error_detail(exc, "failed to sync nextcloud")
|
detail = safe_error_detail(exc, "failed to sync nextcloud")
|
||||||
self._task_error(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
|
self._task_fail(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
|
||||||
|
|
||||||
def _ensure_wger_account(self, conn, ctx: RequestContext) -> None:
|
def _ensure_wger_account(self, conn, ctx: RequestContext) -> None:
|
||||||
start = datetime.now(timezone.utc)
|
start = datetime.now(timezone.utc)
|
||||||
@ -779,7 +828,7 @@ class ProvisioningManager:
|
|||||||
self._task_ok(conn, ctx.request_code, "wger_account", None, start)
|
self._task_ok(conn, ctx.request_code, "wger_account", None, start)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
detail = safe_error_detail(exc, "failed to provision wger")
|
detail = safe_error_detail(exc, "failed to provision wger")
|
||||||
self._task_error(conn, ctx.request_code, "wger_account", detail, start)
|
self._task_fail(conn, ctx.request_code, "wger_account", detail, start)
|
||||||
|
|
||||||
def _ensure_firefly_account(self, conn, ctx: RequestContext) -> None:
|
def _ensure_firefly_account(self, conn, ctx: RequestContext) -> None:
|
||||||
start = datetime.now(timezone.utc)
|
start = datetime.now(timezone.utc)
|
||||||
@ -804,7 +853,7 @@ class ProvisioningManager:
|
|||||||
self._task_ok(conn, ctx.request_code, "firefly_account", None, start)
|
self._task_ok(conn, ctx.request_code, "firefly_account", None, start)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
detail = safe_error_detail(exc, "failed to provision firefly")
|
detail = safe_error_detail(exc, "failed to provision firefly")
|
||||||
self._task_error(conn, ctx.request_code, "firefly_account", detail, start)
|
self._task_fail(conn, ctx.request_code, "firefly_account", detail, start)
|
||||||
|
|
||||||
def _handle_vaultwarden_grandfathered(self, conn, ctx: RequestContext, start: datetime) -> None:
|
def _handle_vaultwarden_grandfathered(self, conn, ctx: RequestContext, start: datetime) -> None:
|
||||||
lookup = vaultwarden.find_user_by_email(ctx.contact_email)
|
lookup = vaultwarden.find_user_by_email(ctx.contact_email)
|
||||||
@ -827,7 +876,7 @@ class ProvisioningManager:
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
detail = lookup.detail or lookup.status
|
detail = lookup.detail or lookup.status
|
||||||
self._task_error(conn, ctx.request_code, "vaultwarden_invite", detail, start)
|
self._task_fail(conn, ctx.request_code, "vaultwarden_invite", detail, start)
|
||||||
|
|
||||||
def _ensure_vaultwarden_invite(self, conn, ctx: RequestContext) -> None:
|
def _ensure_vaultwarden_invite(self, conn, ctx: RequestContext) -> None:
|
||||||
start = datetime.now(timezone.utc)
|
start = datetime.now(timezone.utc)
|
||||||
@ -859,7 +908,7 @@ class ProvisioningManager:
|
|||||||
self._set_vaultwarden_attrs(ctx.username, ctx.mailu_email, status)
|
self._set_vaultwarden_attrs(ctx.username, ctx.mailu_email, status)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
detail = safe_error_detail(exc, "failed to provision vaultwarden")
|
detail = safe_error_detail(exc, "failed to provision vaultwarden")
|
||||||
self._task_error(conn, ctx.request_code, "vaultwarden_invite", detail, start)
|
self._task_fail(conn, ctx.request_code, "vaultwarden_invite", detail, start)
|
||||||
|
|
||||||
def _send_welcome_email(self, request_code: str, username: str, contact_email: str) -> None:
|
def _send_welcome_email(self, request_code: str, username: str, contact_email: str) -> None:
|
||||||
if not settings.welcome_email_enabled:
|
if not settings.welcome_email_enabled:
|
||||||
|
|||||||
@ -110,6 +110,45 @@ def test_account_access_allows_missing_groups(monkeypatch) -> None:
|
|||||||
assert resp.status_code != 403
|
assert resp.status_code != 403
|
||||||
|
|
||||||
|
|
||||||
|
def test_retry_access_request_ok(monkeypatch) -> None:
|
||||||
|
ctx = AuthContext(username="", email="", groups=[], claims={})
|
||||||
|
client = _client(monkeypatch, ctx)
|
||||||
|
executed = []
|
||||||
|
invoked = {}
|
||||||
|
|
||||||
|
monkeypatch.setattr(app_module.keycloak_admin, "ready", lambda: True)
|
||||||
|
monkeypatch.setattr(app_module.portal_db, "fetchone", lambda *_args, **_kwargs: {"status": "accounts_building"})
|
||||||
|
monkeypatch.setattr(app_module.portal_db, "execute", lambda query, params=None: executed.append((query, params)))
|
||||||
|
monkeypatch.setattr(app_module.provisioning, "provision_access_request", lambda code: invoked.setdefault("code", code))
|
||||||
|
monkeypatch.setattr(app_module, "_record_event", lambda *args, **kwargs: None)
|
||||||
|
|
||||||
|
resp = client.post("/api/access/requests/REQ123/retry")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["request_code"] == "REQ123"
|
||||||
|
assert invoked["code"] == "REQ123"
|
||||||
|
assert any("provision_attempted_at" in query for query, _params in executed)
|
||||||
|
|
||||||
|
|
||||||
|
def test_retry_access_request_not_found(monkeypatch) -> None:
|
||||||
|
ctx = AuthContext(username="", email="", groups=[], claims={})
|
||||||
|
client = _client(monkeypatch, ctx)
|
||||||
|
monkeypatch.setattr(app_module.keycloak_admin, "ready", lambda: True)
|
||||||
|
monkeypatch.setattr(app_module.portal_db, "fetchone", lambda *_args, **_kwargs: None)
|
||||||
|
|
||||||
|
resp = client.post("/api/access/requests/REQ123/retry")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_retry_access_request_not_retryable(monkeypatch) -> None:
|
||||||
|
ctx = AuthContext(username="", email="", groups=[], claims={})
|
||||||
|
client = _client(monkeypatch, ctx)
|
||||||
|
monkeypatch.setattr(app_module.keycloak_admin, "ready", lambda: True)
|
||||||
|
monkeypatch.setattr(app_module.portal_db, "fetchone", lambda *_args, **_kwargs: {"status": "ready"})
|
||||||
|
|
||||||
|
resp = client.post("/api/access/requests/REQ123/retry")
|
||||||
|
assert resp.status_code == 409
|
||||||
|
|
||||||
|
|
||||||
def test_metrics_endpoint(monkeypatch) -> None:
|
def test_metrics_endpoint(monkeypatch) -> None:
|
||||||
ctx = AuthContext(username="", email="", groups=[], claims={})
|
ctx = AuthContext(username="", email="", groups=[], claims={})
|
||||||
client = _client(monkeypatch, ctx)
|
client = _client(monkeypatch, ctx)
|
||||||
|
|||||||
@ -809,6 +809,15 @@ def test_provisioning_task_helpers() -> None:
|
|||||||
assert manager._all_tasks_ok(Conn(), "REQ", ["b"]) is False
|
assert manager._all_tasks_ok(Conn(), "REQ", ["b"]) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_provisioning_retryable_detail_detection() -> None:
|
||||||
|
manager = prov.ProvisioningManager(DummyDB({}, locked=True), DummyStorage())
|
||||||
|
assert manager._is_retryable_detail("timeout") is True
|
||||||
|
assert manager._is_retryable_detail("http 503: service unavailable") is True
|
||||||
|
assert manager._is_retryable_detail("mailbox not ready") is True
|
||||||
|
assert manager._is_retryable_detail("invalid credentials") is False
|
||||||
|
assert manager._retryable_detail("timeout").startswith("retryable:")
|
||||||
|
|
||||||
|
|
||||||
def test_provisioning_ensure_task_rows_empty() -> None:
|
def test_provisioning_ensure_task_rows_empty() -> None:
|
||||||
manager = prov.ProvisioningManager(DummyDB({}), DummyStorage())
|
manager = prov.ProvisioningManager(DummyDB({}), DummyStorage())
|
||||||
manager._ensure_task_rows(DummyConn({}, locked=True), "REQ", [])
|
manager._ensure_task_rows(DummyConn({}, locked=True), "REQ", [])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user