def login_with_retry(user: str, password: str):
    """Call ``login(user, password)`` until it succeeds, backing off between tries.

    After each failure the function waits ``min(cap, 2 ** attempts)`` seconds,
    where ``cap`` is ``LOGIN_RETRY_CAP_SEC`` clamped to be non-negative, logs
    the failure to stdout and tries again.

    Args:
        user: Account identifier passed through to ``login``; it is run through
            ``normalize_user_id`` only for the log line.
        password: Credential passed through to ``login`` unchanged.

    Returns:
        Whatever ``login`` returns on the first successful call.

    Raises:
        Exception: The exception from the most recent ``login`` call, re-raised
            unchanged once ``LOGIN_MAX_ATTEMPTS`` failures have occurred.
            When ``LOGIN_MAX_ATTEMPTS`` is 0 or negative the loop never gives
            up, so nothing is raised from here.
    """
    # time.sleep() rejects negative values with ValueError; a misconfigured
    # negative cap must not turn a recoverable login failure into a crash.
    cap = max(0, LOGIN_RETRY_CAP_SEC)
    # 2 ** cap.bit_length() is always > cap, so bounding the exponent here
    # keeps the power small while still letting the delay reach the cap,
    # whatever value the operator configured.
    max_exponent = max(cap, 1).bit_length()

    attempts = 0
    while True:
        try:
            return login(user, password)
        except Exception as exc:  # noqa: BLE001
            attempts += 1
            if LOGIN_MAX_ATTEMPTS > 0 and attempts >= LOGIN_MAX_ATTEMPTS:
                # Bare raise keeps the original traceback of the last failure.
                raise
            delay = min(cap, 2 ** min(attempts, max_exponent))
            print(
                f"atlasbot login retry for {normalize_user_id(user)} "
                f"(attempt={attempts}, delay={delay}s): {exc}",
                flush=True,
            )
            time.sleep(delay)