atlasbot: keep retrying MAS login during transient Synapse outages

This commit is contained in:
Brad Stein 2026-04-07 13:09:36 -03:00
parent fa160f5f9b
commit cfdd5a377d

View File

@ -34,6 +34,9 @@ OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090")) ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090"))
ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "") ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "")
SNAPSHOT_TTL_SEC = int(os.environ.get("ATLASBOT_SNAPSHOT_TTL_SEC", "30")) SNAPSHOT_TTL_SEC = int(os.environ.get("ATLASBOT_SNAPSHOT_TTL_SEC", "30"))
# Upper bound, in seconds, on the exponential backoff delay that
# login_with_retry sleeps between failed login attempts (default: 60).
LOGIN_RETRY_CAP_SEC = int(os.environ.get("ATLASBOT_LOGIN_RETRY_CAP_SEC", "60"))
# 0 means retry forever (default); useful during startup when MAS/Synapse ordering is still converging.
# Any positive value makes login_with_retry re-raise the login error once that many attempts have failed.
LOGIN_MAX_ATTEMPTS = int(os.environ.get("ATLASBOT_LOGIN_MAX_ATTEMPTS", "0"))
KB_DIR = os.environ.get("KB_DIR", "") KB_DIR = os.environ.get("KB_DIR", "")
VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
@ -5182,14 +5185,21 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
history[hist_key] = history[hist_key][-80:] history[hist_key] = history[hist_key][-80:]
def login_with_retry(user: str, password: str):
    """Log in via ``login``, retrying with capped exponential backoff.

    Each failed attempt is reported on stdout and followed by a sleep of
    ``min(LOGIN_RETRY_CAP_SEC, 2 ** min(attempts, 8))`` seconds, clamped
    so it is never negative.

    Args:
        user: Account identifier passed through to ``login``.
        password: Password passed through to ``login``.

    Returns:
        Whatever ``login(user, password)`` returns on the first success.

    Raises:
        Exception: The most recent ``login`` failure, re-raised unchanged
            once ``LOGIN_MAX_ATTEMPTS`` (when greater than zero) attempts
            have failed. With ``LOGIN_MAX_ATTEMPTS == 0`` the loop never
            gives up.
    """
    attempts = 0
    while True:
        try:
            return login(user, password)
        except Exception as exc:  # noqa: BLE001
            attempts += 1
            if LOGIN_MAX_ATTEMPTS > 0 and attempts >= LOGIN_MAX_ATTEMPTS:
                # Bare raise keeps the original traceback of the login failure.
                raise
            # The exponent is capped at 8 so the power stays small no matter
            # how long the outage lasts. The result is clamped at zero because
            # a negative LOGIN_RETRY_CAP_SEC would otherwise make time.sleep
            # raise ValueError here and abort the retry loop.
            delay = max(0, min(LOGIN_RETRY_CAP_SEC, 2 ** min(attempts, 8)))
            print(
                f"atlasbot login retry for {normalize_user_id(user)} "
                f"(attempt={attempts}, delay={delay}s): {exc}",
                flush=True,
            )
            time.sleep(delay)
def _bot_accounts() -> list[dict[str, str]]: def _bot_accounts() -> list[dict[str, str]]:
accounts: list[dict[str, str]] = [] accounts: list[dict[str, str]] = []