Compare commits

..

105 Commits

Author SHA1 Message Date
codex
18d518b47a security(ariadne): harden image and bump jwt 2026-04-21 23:59:48 -03:00
codex
e22a47b65e ci(ariadne): use preloaded quality scanner image 2026-04-21 22:50:09 -03:00
codex
a3b140e090 ci(ariadne): pass sonar token as login 2026-04-21 22:17:54 -03:00
codex
533e284752 ci(ariadne): run sonar and supply-chain scans 2026-04-21 22:09:05 -03:00
codex
54ffc65165 ci(ariadne): bind sonarqube token credential 2026-04-21 20:16:13 -03:00
codex
831a0fda1b ci(ariadne): use host networking for buildx image builds 2026-04-21 18:48:48 -03:00
codex
e36fc5229d ci(ariadne): enforce per-file coverage contract 2026-04-21 18:40:35 -03:00
codex
bdb9b47291 ci(ariadne): retry buildx bootstrap 2026-04-21 18:36:02 -03:00
codex
9dc76b10f4 ci(ariadne): retry buildx pushes 2026-04-21 18:07:00 -03:00
codex
02a4b852ad ci(ariadne): use mirrored buildkit builder 2026-04-21 17:35:54 -03:00
codex
3649262316 test(ariadne): close scheduler coverage gap 2026-04-21 17:22:48 -03:00
codex
84cc4e6236 ci(ariadne): start docker daemon without tls delay 2026-04-21 17:11:59 -03:00
codex
b21b182199 ci(ariadne): use unique kubernetes agents 2026-04-21 13:47:51 -03:00
codex
e9f48269d3 ci(ariadne): use harbor docker runners 2026-04-21 13:34:21 -03:00
codex
969ec81fd9 ci(ariadne): use harbor python runner 2026-04-21 13:16:49 -03:00
codex
1b0137d984 fix(ariadne): hydrate schedule metrics after restart 2026-04-21 12:39:46 -03:00
codex
949ef2c6ad ci(ariadne): label test metrics with build artifacts 2026-04-21 11:39:00 -03:00
codex
f10540b4a9 ci(ariadne): include primary branch in quality metrics 2026-04-21 11:08:21 -03:00
codex
21407bdf39 ci(ariadne): retrigger after titan-04 cordon 2026-04-21 10:37:33 -03:00
codex
369760c841 ci(ariadne): rebuild missing base image in pipeline 2026-04-21 10:27:06 -03:00
codex
98bf1044b5 ci(ariadne): publish canonical build info 2026-04-21 09:34:51 -03:00
codex
91648526ef ci(ariadne): archive full quality evidence 2026-04-21 09:21:56 -03:00
codex
ec232d0079 test(ariadne): cover final helper edges 2026-04-21 04:54:41 -03:00
codex
19372c8a9a test(ariadne): cover VM trend filters 2026-04-21 04:52:05 -03:00
codex
604b198534 test(ariadne): cover storage snapshot edges 2026-04-21 04:49:30 -03:00
codex
4bbb50f5ad test(ariadne): cover cluster signal filters 2026-04-21 04:47:14 -03:00
codex
f9910c4281 test(ariadne): cover comms protocol helpers 2026-04-21 04:45:18 -03:00
codex
8f349e88b2 test(ariadne): cover VM usage edge inputs 2026-04-21 04:43:11 -03:00
codex
70a08768b4 test(ariadne): cover cluster metric collectors 2026-04-21 04:40:29 -03:00
codex
77fc16a7cb test(ariadne): cover database migration edges 2026-04-21 04:37:47 -03:00
codex
c41ff7ab3b test(ariadne): cover admin route edges 2026-04-21 04:33:56 -03:00
codex
186f7927ba test(ariadne): cover cluster node summary edges 2026-04-21 04:29:51 -03:00
codex
39d671c98d test(ariadne): cover safe error fallbacks 2026-04-21 04:27:12 -03:00
codex
a8b1e5ac7c test(ariadne): cover nextcloud maintenance failures 2026-04-21 04:24:53 -03:00
codex
102eaf8d92 test(ariadne): cover cluster attention edges 2026-04-21 04:21:57 -03:00
codex
236f2a5318 test(ariadne): cover cluster pod summary edges 2026-04-21 04:19:37 -03:00
codex
71c1b9b7bf test(ariadne): cover cluster anomaly edges 2026-04-21 04:16:43 -03:00
codex
13594eba57 test(ariadne): cover comms room operation edges 2026-04-21 04:13:49 -03:00
codex
1434fbedf1 test(ariadne): cover nextcloud mail model edges 2026-04-21 04:10:08 -03:00
codex
86e9a2d82b test(ariadne): cover provisioning account edges 2026-04-21 04:08:02 -03:00
codex
a8a9f04c44 test(ariadne): cover account rotation routes 2026-04-21 04:03:51 -03:00
codex
8fec20e816 test(ariadne): cover Jenkins weather edges 2026-04-21 03:59:18 -03:00
codex
faff23408a test(ariadne): cover Vaultwarden sync edges 2026-04-21 03:56:40 -03:00
codex
160dbd5f3d test(ariadne): cover Vaultwarden service edges 2026-04-21 03:53:30 -03:00
codex
1b2d30e67e test(ariadne): normalize metrics coverage signatures 2026-04-21 03:50:54 -03:00
codex
e6c7b1ab9f test(ariadne): cover Mailu event runner edges 2026-04-21 03:48:59 -03:00
codex
d3ae03f935 test(ariadne): cover cluster health helpers 2026-04-21 03:46:38 -03:00
codex
9965983322 test(ariadne): cover VM client edge parsing 2026-04-21 03:44:43 -03:00
codex
415da50fa1 test(ariadne): cover cluster profile builders 2026-04-21 03:42:16 -03:00
codex
24c3d842c1 test(ariadne): cover Vault service edges 2026-04-21 03:39:31 -03:00
codex
4fec10d1ee test(ariadne): cover Kubernetes exec edges 2026-04-21 03:34:35 -03:00
codex
dc4e76c90d test(ariadne): cover Kubernetes pod selector edges 2026-04-21 03:28:43 -03:00
codex
bca3d87743 test(ariadne): cover cluster relationship edges 2026-04-21 03:25:30 -03:00
codex
03f9118f21 test(ariadne): cover Metis watch edges 2026-04-21 03:20:56 -03:00
codex
40c1a3652b test(ariadne): cover provisioning retry helpers 2026-04-21 03:18:08 -03:00
codex
01ccdd3fcb test(ariadne): cover OpenSearch prune edges 2026-04-21 03:14:59 -03:00
codex
4966cc7f35 test(ariadne): cover Jenkins workspace cleanup edges 2026-04-21 03:12:13 -03:00
codex
2a14d28713 test(ariadne): cover Jenkins workspace candidate filters 2026-04-21 03:09:07 -03:00
codex
0c94ee93ce test(ariadne): cover migration runner 2026-04-21 03:05:07 -03:00
codex
b73e678bfc test(ariadne): cover nextcloud sync edge paths 2026-04-21 03:01:51 -03:00
codex
6b6b9677be test(ariadne): cover cluster state fetcher failures 2026-04-21 02:55:55 -03:00
codex
a17654819c test(ariadne): cover keycloak profile sync edges 2026-04-21 02:53:23 -03:00
codex
9a28ea0086 test(ariadne): cover mailu service edges 2026-04-21 02:50:40 -03:00
codex
ed1fc729d7 test(ariadne): cover firefly and wger edge paths 2026-04-21 02:45:19 -03:00
codex
c07570494a test(ariadne): cover comms guest naming edges 2026-04-21 02:37:42 -03:00
codex
c0ac1e23a7 test(ariadne): cover nextcloud service edges 2026-04-21 02:28:30 -03:00
codex
cbe774acfd test(ariadne): cover cluster state domains 2026-04-21 02:22:47 -03:00
codex
f0e161ba8b test(ariadne): split oversized unit suites 2026-04-21 02:05:59 -03:00
codex
152c19665e refactor(ariadne): split cluster state domains 2026-04-21 02:01:10 -03:00
codex
0fa6138612 refactor(ariadne): split app route registration 2026-04-21 01:50:12 -03:00
codex
18a6471c08 refactor(ariadne): split provisioning workflow helpers 2026-04-21 01:42:56 -03:00
codex
c11996d860 refactor(ariadne): split comms service helpers 2026-04-21 01:36:40 -03:00
codex
7d9b649a43 refactor(ariadne): split jenkins workspace candidates 2026-04-21 01:31:06 -03:00
codex
b5d60fb3be refactor(ariadne): split settings sections 2026-04-21 01:26:42 -03:00
codex
d999b4ff8c refactor(ariadne): split nextcloud mail and maintenance helpers 2026-04-21 01:18:17 -03:00
codex
2477ca3899 refactor(ariadne): split service pod scripts 2026-04-21 01:14:39 -03:00
codex
f0baa619dc refactor(ariadne): split vault policy definitions 2026-04-21 01:12:25 -03:00
codex
63a64661ec quality(ariadne): land doc hygiene baseline 2026-04-21 01:10:20 -03:00
codex
7f284007eb merge master into ariadne hygiene branch 2026-04-21 01:07:44 -03:00
codex
67db7b8438 quality(ariadne): close public docstring hygiene 2026-04-21 01:03:05 -03:00
codex
f95c51e7f5 ci: enforce 30d build and artifact retention 2026-04-20 12:26:21 -03:00
codex
b3c86752e3 ci(ariadne): retrigger metrics publish 2026-04-20 11:06:18 -03:00
codex
20fd0a9f38 ci(ariadne): fallback to discovered junit/coverage paths 2026-04-20 11:01:01 -03:00
codex
7e281e6548 ci(ariadne): always run tests for quality metrics visibility 2026-04-20 10:51:59 -03:00
codex
b9951da1ae ci(ariadne): always run tests for quality metrics visibility 2026-04-20 10:49:30 -03:00
codex
6e1416d1ae ci(ariadne): emit placeholder test-case metric when junit cases are absent 2026-04-20 09:10:48 -03:00
codex
2eadf55557 ci(ariadne): emit per-test case result metrics for flaky tracking 2026-04-20 08:19:45 -03:00
codex
3c157b9523 ci(ariadne): enforce docs gate before loc/coverage and publish docs_naming 2026-04-20 08:12:22 -03:00
codex
c64aca3869 ci: retrigger after jenkins rollout 2026-04-19 21:51:24 -03:00
codex
783b089af2 ci(gate): default sonar and supply checks to observe mode 2026-04-19 21:29:27 -03:00
codex
eb05d0bd50 ci(gate): enforce sonarqube and supply-chain checks 2026-04-19 21:16:02 -03:00
6e2d5ea6ed ci(ariadne): guard optional coverage contract checker 2026-04-19 16:13:22 -03:00
05b788c118 ci(metrics): replace suite payload in Pushgateway via PUT 2026-04-19 16:08:14 -03:00
26cc9333c7 ci(ariadne): restore LOC checker with tracked waivers 2026-04-19 15:04:48 -03:00
a57577e2a5 ci: always publish Ariadne gate metrics even on missing artifacts 2026-04-19 14:40:27 -03:00
bbb958b7c5 ci: add sonar/supply evidence collection and checks metrics 2026-04-19 14:10:41 -03:00
6f4c141d97 ci(metrics): publish checks + platform coverage/loc metrics 2026-04-18 16:32:08 -03:00
eb931e8d46 ci(jenkins): replace weather threshold magic numbers 2026-04-13 01:58:16 -03:00
de2523c313 ci(jenkins): avoid nodes with broken kubelet exec proxy 2026-04-13 01:53:21 -03:00
1094323f1a maintenance(jenkins): export build weather metrics for grafana 2026-04-13 00:25:15 -03:00
27788d307f cleanup(jenkins): detect longhorn orphan volumes via kubernetes status 2026-04-12 14:28:08 -03:00
4cc2f0c355 cleanup(jenkins): gate orphan volume deletes by pvc namespace 2026-04-12 14:24:17 -03:00
2ff3686700 cleanup(jenkins): harden pvc cleanup with dry-run metrics 2026-04-12 12:28:50 -03:00
ad99a83a98 ariadne: split workspace cleanup flow for pedantic lint 2026-04-12 04:57:46 -03:00
1dcc37e8a7 ariadne: add scheduled jenkins workspace pvc cleanup 2026-04-12 04:49:25 -03:00
145 changed files with 19071 additions and 11900 deletions

View File

@ -5,8 +5,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
WORKDIR /app
COPY ariadne /app/ariadne
COPY --chown=ariadne:ariadne ariadne /app/ariadne
EXPOSE 8080
USER ariadne
CMD ["uvicorn", "ariadne.app:app", "--host", "0.0.0.0", "--port", "8080"]

View File

@ -6,4 +6,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt && \
addgroup --system ariadne && \
adduser --system --ingroup ariadne --home /app ariadne && \
chown -R ariadne:ariadne /app
USER ariadne

View File

@ -1,6 +1,10 @@
FROM registry.bstein.dev/bstein/ariadne-base:py312
USER root
WORKDIR /app
COPY requirements-dev.txt /app/requirements-dev.txt
RUN pip install --no-cache-dir -r /app/requirements-dev.txt
RUN pip install --no-cache-dir -r /app/requirements-dev.txt && \
chown -R ariadne:ariadne /app
USER ariadne

267
Jenkinsfile vendored
View File

@ -1,7 +1,6 @@
pipeline {
agent {
kubernetes {
label 'ariadne'
defaultContainer 'builder'
yaml """
apiVersion: v1
@ -17,7 +16,7 @@ spec:
- name: harbor-robot-pipeline
containers:
- name: dind
image: docker:27-dind
image: registry.bstein.dev/bstein/docker:27-dind
securityContext:
privileged: true
env:
@ -27,11 +26,12 @@ spec:
- --mtu=1400
- --host=unix:///var/run/docker.sock
- --host=tcp://0.0.0.0:2375
- --tls=false
volumeMounts:
- name: dind-storage
mountPath: /var/lib/docker
- name: builder
image: docker:27
image: registry.bstein.dev/bstein/docker:27
command: ["cat"]
tty: true
env:
@ -49,7 +49,14 @@ spec:
- name: harbor-config
mountPath: /docker-config
- name: tester
image: python:3.12-slim
image: registry.bstein.dev/bstein/python:3.12-slim
command: ["cat"]
tty: true
volumeMounts:
- name: workspace-volume
mountPath: /home/jenkins/agent
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
command: ["cat"]
tty: true
volumeMounts:
@ -76,14 +83,21 @@ spec:
IMAGE = "${REGISTRY}/ariadne"
VERSION_TAG = 'dev'
SEMVER = 'dev'
COVERAGE_MIN = '99'
COVERAGE_MIN = '95'
COVERAGE_JSON = 'build/coverage.json'
JUNIT_XML = 'build/junit.xml'
SUITE_NAME = 'ariadne'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'ariadne'
SONARQUBE_TOKEN = credentials('sonarqube-token')
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
BUILDKIT_IMAGE = 'registry.bstein.dev/bstein/buildkit:buildx-stable-1'
}
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
}
triggers {
pollSCM('H/2 * * * *')
@ -95,22 +109,171 @@ spec:
}
}
stage('Unit tests') {
stage('Collect SonarQube evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
# Run sonar-scanner against the workspace and record its exit code as evidence.
set -euo pipefail
mkdir -p build
# Scanner arguments; exclusions keep VCS metadata, build output and vendored
# dependencies out of the analysis scope.
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=."
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**"
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
# Attach Python coverage only when an earlier stage produced the report.
[ -f build/coverage.xml ] && args+=("-Dsonar.python.coverage.reportPaths=build/coverage.xml")
# A scanner failure must not abort the stage; the rc file carries the outcome.
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
# PIPESTATUS[0] is the exit code of sonar-scanner itself, not of tee.
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
}
container('tester') {
sh '''
set -euo pipefail
mkdir -p build
python3 - <<'PY'
"""Fetch the SonarQube quality-gate status and persist it as JSON evidence.

Reads SONARQUBE_HOST_URL / SONARQUBE_PROJECT_KEY / SONARQUBE_TOKEN from the
environment and writes the API payload (or an ERROR stub) to the path named
by QUALITY_GATE_SONARQUBE_REPORT. Indentation restored from the rendered diff.
"""
import base64
import json
import os
import urllib.parse
import urllib.request

host = os.getenv('SONARQUBE_HOST_URL', '').strip().rstrip('/')
project_key = os.getenv('SONARQUBE_PROJECT_KEY', '').strip()
token = os.getenv('SONARQUBE_TOKEN', '').strip()
report_path = os.getenv('QUALITY_GATE_SONARQUBE_REPORT', 'build/sonarqube-quality-gate.json')
# Default payload used when configuration is missing or the API call fails.
payload = {"status": "ERROR", "note": "missing SONARQUBE_HOST_URL and/or SONARQUBE_PROJECT_KEY"}
if host and project_key:
    query = urllib.parse.urlencode({"projectKey": project_key})
    request = urllib.request.Request(f"{host}/api/qualitygates/project_status?{query}", method="GET")
    if token:
        # SonarQube token auth is HTTP Basic with the token as user, empty password.
        encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
        request.add_header("Authorization", f"Basic {encoded}")
    try:
        with urllib.request.urlopen(request, timeout=12) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except Exception as exc:  # noqa: BLE001 - evidence collection is best-effort
        payload = {"status": "ERROR", "error": str(exc)}
# The report is written unconditionally so downstream gates always find a file.
with open(report_path, "w", encoding="utf-8") as handle:
    json.dump(payload, handle, indent=2, sort_keys=True)
    handle.write("\\n")
PY
'''
}
}
}
stage('Collect Supply Chain evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
# Filesystem scan with Trivy; summarize findings into an Iron Bank-style report.
set -euo pipefail
mkdir -p build
# Tolerate scanner failure; absence of JSON output is handled explicitly below.
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
# No JSON produced: emit a failed-compliance stub and exit 0 so evidence still lands.
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
# Tally findings per class from the Trivy JSON with jq.
critical="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="CRITICAL")] | length' build/trivy-fs.json)"
high="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="HIGH")] | length' build/trivy-fs.json)"
secrets="$(jq '[.Results[]? | .Secrets[]?] | length' build/trivy-fs.json)"
misconfigs="$(jq '[.Results[]? | .Misconfigurations[]? | select(.Status=="FAIL" and (.Severity=="CRITICAL" or .Severity=="HIGH"))] | length' build/trivy-fs.json)"
status=ok
compliant=true
# Only CRITICAL vulns, secrets and HIGH/CRITICAL misconfigs flip compliance;
# HIGH vulns are observe-only (see high_vulnerability_policy below).
if [ "${critical}" -gt 0 ] || [ "${secrets}" -gt 0 ] || [ "${misconfigs}" -gt 0 ]; then
status=failed
compliant=false
fi
jq -n --arg status "${status}" --argjson compliant "${compliant}" --argjson critical "${critical}" --argjson high "${high}" --argjson secrets "${secrets}" --argjson misconfigs "${misconfigs}" --argjson trivy_rc "${trivy_rc}" \
'{status:$status, compliant:$compliant, category:"artifact_security", scan_type:"filesystem", scanner:"trivy", critical_vulnerabilities:$critical, high_vulnerabilities:$high, secrets:$secrets, high_or_critical_misconfigurations:$misconfigs, trivy_rc:$trivy_rc, high_vulnerability_policy:"observe"}' > build/ironbank-compliance.json
'''
}
container('tester') {
sh '''
set -euo pipefail
mkdir -p build
python3 - <<'PY'
# Ensure an Iron Bank compliance report exists even when the scanner wrote none.
import json
import os
from pathlib import Path
report_path = Path(os.getenv('QUALITY_GATE_IRONBANK_REPORT', 'build/ironbank-compliance.json'))
# A report already written by the Trivy stage wins; nothing to do.
if report_path.exists():
raise SystemExit(0)
# Fall back to operator-supplied environment overrides.
status = os.getenv('IRONBANK_COMPLIANCE_STATUS', '').strip()
compliant = os.getenv('IRONBANK_COMPLIANT', '').strip().lower()
payload = {"status": status or "unknown", "compliant": compliant in {"1", "true", "yes", "on"} if compliant else None}
# Drop unset keys so the stub only asserts what was actually provided.
payload = {k: v for k, v in payload.items() if v is not None}
if "status" not in payload:
payload["status"] = "unknown"
# NOTE(review): indentation was lost in this rendering; whether the "note"
# assignment below sits inside the preceding if-block should be confirmed
# against the repository.
payload["note"] = "Set IRONBANK_COMPLIANCE_STATUS/IRONBANK_COMPLIANT or write build/ironbank-compliance.json in image-building repos."
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\\n", encoding="utf-8")
PY
'''
}
}
}
stage('Run quality gate') {
steps {
container('tester') {
sh(script: '''
set -euo pipefail
mkdir -p build
set +e
python -m pip install --no-cache-dir -r requirements.txt -r requirements-dev.txt
python -m ruff check ariadne --select PLR
python scripts/check_file_sizes.py --roots ariadne scripts tests --max-lines 500 --waivers scripts/loc_hygiene_waivers.tsv
python -m slipcover \
--json \
--out "${COVERAGE_JSON}" \
--source ariadne \
--fail-under "${COVERAGE_MIN}" \
-m pytest -ra -vv --durations=20 --junitxml "${JUNIT_XML}"
python -c "import json; payload=json.load(open('build/coverage.json', encoding='utf-8')); percent=(payload.get('summary') or {}).get('percent_covered'); print(f'Coverage summary: {percent:.2f}%' if percent is not None else 'Coverage summary unavailable')"
install_rc=$?
docs_rc=1
lint_rc=1
loc_rc=1
tests_rc=1
coverage_contract_rc=0
gate_rc=1
if [ "${install_rc}" -eq 0 ]; then
python scripts/check_docstrings.py --root ariadne
docs_rc=$?
python -m ruff check ariadne scripts --select PLR
lint_rc=$?
python scripts/check_file_sizes.py --roots ariadne scripts tests --max-lines 500 --waivers ci/loc_hygiene_waivers.tsv
loc_rc=$?
python -m slipcover \
--json \
--out "${COVERAGE_JSON}" \
--source ariadne \
--fail-under "${COVERAGE_MIN}" \
-m pytest -ra -vv --durations=20 --junitxml "${JUNIT_XML}"
tests_rc=$?
python -c "import json; payload=json.load(open('build/coverage.json', encoding='utf-8')); percent=(payload.get('summary') or {}).get('percent_covered'); print(f'Coverage summary: {percent:.2f}%' if percent is not None else 'Coverage summary unavailable')" || true
if [ -f "${COVERAGE_JSON}" ] && [ -f scripts/check_coverage_contract.py ]; then
python scripts/check_coverage_contract.py "${COVERAGE_JSON}" --source-root ariadne --threshold "${COVERAGE_MIN}"
coverage_contract_rc=$?
else
echo "coverage contract check skipped: checker or coverage report missing"
fi
fi
printf '%s\n' "${docs_rc}" > build/docs-naming.rc
if [ "${install_rc}" -eq 0 ]; then
gate_rc=0
[ "${docs_rc}" -eq 0 ] || gate_rc=1
[ "${lint_rc}" -eq 0 ] || gate_rc=1
[ "${loc_rc}" -eq 0 ] || gate_rc=1
[ "${tests_rc}" -eq 0 ] || gate_rc=1
[ "${coverage_contract_rc}" -eq 0 ] || gate_rc=1
fi
set -e
printf '%s\n' "${gate_rc}" > build/quality-gate.rc
'''.stripIndent())
}
}
@ -121,7 +284,18 @@ python -c "import json; payload=json.load(open('build/coverage.json', encoding='
container('tester') {
sh '''
set -euo pipefail
python scripts/publish_test_metrics.py
python scripts/publish_test_metrics.py || true
'''
}
}
}
stage('Enforce quality gate') {
steps {
container('tester') {
sh '''
set -euo pipefail
test "$(cat build/quality-gate.rc 2>/dev/null || echo 1)" -eq 0
'''
}
}
@ -177,21 +351,76 @@ python -c "import json; payload=json.load(open('build/coverage.json', encoding='
fi
BUILDER_NAME="ariadne-${BUILD_NUMBER}"
docker buildx rm "${BUILDER_NAME}" >/dev/null 2>&1 || true
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --bootstrap --use
attempt=1
while [ "${attempt}" -le 3 ]; do
if docker buildx create --name "${BUILDER_NAME}" --driver docker-container --driver-opt "image=${BUILDKIT_IMAGE}" --bootstrap --use; then
break
fi
docker buildx rm "${BUILDER_NAME}" >/dev/null 2>&1 || true
echo "buildx bootstrap attempt ${attempt}/3 failed; retrying after registry/network backoff" >&2
sleep $((attempt * 15))
attempt=$((attempt + 1))
done
if [ "${attempt}" -gt 3 ]; then
echo "buildx bootstrap failed after retries" >&2
exit 1
fi
docker buildx inspect "${BUILDER_NAME}" --bootstrap
'''
}
}
}
stage('Build & push base image') {
steps {
container('builder') {
sh '''
set -euo pipefail
# Retry docker buildx up to 3 times with linear backoff (15s, 30s, 45s)
# to ride out transient registry/network failures.
retry_buildx() {
attempt=1
while [ "${attempt}" -le 3 ]; do
if docker buildx build "$@"; then
return 0
fi
echo "buildx attempt ${attempt}/3 failed; retrying after registry/network backoff" >&2
sleep $((attempt * 15))
attempt=$((attempt + 1))
done
return 1
}
# Build and push the shared arm64 base image consumed by the app Dockerfile.
retry_buildx \
--platform linux/arm64 \
--network host \
--file Dockerfile.base \
--tag "${REGISTRY}/ariadne-base:py312" \
--push \
.
'''
}
}
}
stage('Build & push image') {
steps {
container('builder') {
sh '''
set -euo pipefail
VERSION_TAG="$(cut -d= -f2 build.env)"
docker buildx build \
retry_buildx() {
attempt=1
while [ "${attempt}" -le 3 ]; do
if docker buildx build "$@"; then
return 0
fi
echo "buildx attempt ${attempt}/3 failed; retrying after registry/network backoff" >&2
sleep $((attempt * 15))
attempt=$((attempt + 1))
done
return 1
}
retry_buildx \
--platform linux/arm64 \
--network host \
--tag "${IMAGE}:${VERSION_TAG}" \
--tag "${IMAGE}:latest" \
--push \
@ -213,7 +442,7 @@ python -c "import json; payload=json.load(open('build/coverage.json', encoding='
}
}
}
archiveArtifacts artifacts: 'build/junit.xml,build/coverage.json', allowEmptyArchive: true, fingerprint: true
archiveArtifacts artifacts: 'build/**', allowEmptyArchive: true, fingerprint: true
script {
def props = fileExists('build.env') ? readProperties(file: 'build.env') : [:]
echo "Build complete for ${props['SEMVER'] ?: env.VERSION_TAG}"

View File

@ -1,69 +1,49 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
import json
import threading
from typing import Any, Callable
import sys
from typing import Any
from fastapi import Body, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, Response
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
from .app_account_routes import _register_account_routes
from .app_admin_routes import _register_admin_routes
from .auth.keycloak import AuthContext, authenticator
from .db.database import Database, DatabaseConfig
from .db.storage import Storage, TaskRunRecord
from .db.storage import Storage
from .manager.provisioning import ProvisioningManager
from .metrics.metrics import record_task_run
from .scheduler.cron import CronScheduler
from .services.cluster_state import run_cluster_state
from .services.comms import comms
from .services.firefly import firefly
from .services.image_sweeper import image_sweeper
from .services.jenkins_build_weather import collect_jenkins_build_weather
from .services.jenkins_workspace_cleanup import cleanup_jenkins_workspace_storage
from .services.keycloak_admin import keycloak_admin
from .services.keycloak_profile import run_profile_sync
from .services.mailu import mailu
from .services.mailu_events import mailu_events
from .services.nextcloud import nextcloud
from .services.image_sweeper import image_sweeper
from .services.metis import metis
from .services.metis_token_sync import metis_token_sync
from .services.nextcloud import nextcloud
from .services.opensearch_prune import prune_indices
from .services.platform_quality_probe import platform_quality_probe
from .services.pod_cleaner import clean_finished_pods
from .services.vaultwarden_sync import run_vaultwarden_sync
from .services.vault import vault
from .services.vaultwarden_sync import run_vaultwarden_sync
from .services.wger import wger
from .settings import settings
from .utils.errors import safe_error_detail
from .utils.http import extract_bearer_token
from .utils.logging import LogConfig, configure_logging, get_logger, task_context
from .utils.logging import LogConfig, configure_logging, get_logger
from .utils.passwords import random_password
configure_logging(LogConfig(level=settings.log_level))
logger = get_logger(__name__)
@dataclass(frozen=True)
class AccountTaskContext:
task_name: str
username: str
started: datetime
extra: dict[str, Any] | None = None
@dataclass(frozen=True)
class PasswordResetRequest:
    """Immutable description of a single service password-reset operation."""

    # Task name used for logging/metrics/event records.
    task_name: str
    # Human-readable service name used in log messages and error text.
    service_label: str
    username: str
    mailu_email: str
    # New plaintext password to propagate and store as a Keycloak attribute.
    password: str
    # Performs the service-side sync; expected to return {"status": "ok", ...}.
    sync_fn: Callable[[], dict[str, Any]]
    # Keycloak attribute names for the password and its last-updated timestamp.
    password_attr: str
    updated_attr: str
    # Fallback hint used when sanitizing unexpected errors for API responses.
    error_hint: str
portal_db = Database(
settings.portal_database_url,
DatabaseConfig(
@ -91,6 +71,7 @@ ariadne_db = Database(
storage = Storage(ariadne_db, portal_db)
provisioning = ProvisioningManager(portal_db, storage)
scheduler = CronScheduler(storage, settings.schedule_tick_sec)
app = FastAPI(title=settings.app_name)
def _record_event(event_type: str, detail: dict[str, Any] | str | None) -> None:
@ -109,9 +90,6 @@ def _parse_event_detail(detail: str | None) -> Any:
return detail
app = FastAPI(title=settings.app_name)
def _require_auth(request: Request) -> AuthContext:
token = extract_bearer_token(request)
if not token:
@ -167,92 +145,8 @@ def _allowed_flag_groups() -> list[str]:
return settings.allowed_flag_groups
def _resolve_mailu_email(username: str) -> str:
    """Return the user's Mailu address, preferring the Keycloak `mailu_email` attribute.

    Falls back to `<username>@<settings.mailu_domain>` when the attribute is
    absent, malformed, or the Keycloak lookup raises.
    """
    mailu_email = f"{username}@{settings.mailu_domain}"
    try:
        user = keycloak_admin.find_user(username) or {}
        attrs = user.get("attributes") if isinstance(user, dict) else None
        if isinstance(attrs, dict):
            raw_mailu = attrs.get("mailu_email")
            # Attribute may be list-valued or a bare string; accept both.
            if isinstance(raw_mailu, list) and raw_mailu:
                return str(raw_mailu[0])
            if isinstance(raw_mailu, str) and raw_mailu:
                return raw_mailu
    except Exception:
        # Best-effort lookup: any Keycloak failure falls back to the derived address.
        return mailu_email
    return mailu_email
def _record_account_task(ctx: AccountTaskContext, status: str, error_detail: str) -> None:
    """Record metrics, a task-run row, and an event for a finished account task.

    The storage write is deliberately best-effort: a database failure must not
    mask the outcome of the account operation itself, and metrics/event
    emission still happen afterwards.
    """
    finished = datetime.now(timezone.utc)
    duration_sec = (finished - ctx.started).total_seconds()
    record_task_run(ctx.task_name, status, duration_sec)
    try:
        storage.record_task_run(
            TaskRunRecord(
                request_code=None,
                task=ctx.task_name,
                status=status,
                detail=error_detail or None,
                started_at=ctx.started,
                finished_at=finished,
                duration_ms=int(duration_sec * 1000),
            )
        )
    except Exception:
        # Intentional swallow: persistence is optional for this audit trail.
        pass
    detail = {"username": ctx.username, "status": status, "error": error_detail}
    if ctx.extra:
        detail.update(ctx.extra)
    _record_event(ctx.task_name, detail)
def _run_password_reset(request: PasswordResetRequest) -> JSONResponse:
    """Run a service password reset and mirror the result into Keycloak.

    Calls `request.sync_fn()`, requires it to report status "ok", then stores
    the new password and an update timestamp as Keycloak user attributes.
    Every outcome (success or failure) is recorded via `_record_account_task`.

    Raises:
        HTTPException: re-raised unchanged from lower layers, or a 502 wrapping
            any unexpected error (sanitized through `safe_error_detail`).
    """
    started = datetime.now(timezone.utc)
    task_ctx = AccountTaskContext(
        task_name=request.task_name,
        username=request.username,
        started=started,
        extra={"mailu_email": request.mailu_email},
    )
    status = "ok"
    error_detail = ""
    logger.info(
        f"{request.service_label} password reset requested",
        extra={"event": request.task_name, "username": request.username},
    )
    try:
        result = request.sync_fn()
        status_val = result.get("status") if isinstance(result, dict) else "error"
        if status_val != "ok":
            raise RuntimeError(f"{request.service_label} sync {status_val}")
        keycloak_admin.set_user_attribute(
            request.username,
            request.password_attr,
            request.password,
        )
        keycloak_admin.set_user_attribute(
            request.username,
            request.updated_attr,
            datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        )
        logger.info(
            f"{request.service_label} password reset completed",
            extra={"event": request.task_name, "username": request.username},
        )
        return JSONResponse({"status": "ok", "password": request.password})
    except HTTPException as exc:
        status = "error"
        error_detail = str(exc.detail)
        raise
    except Exception as exc:
        status = "error"
        error_detail = safe_error_detail(exc, request.error_hint)
        # Chain the cause so the original failure is preserved in tracebacks.
        raise HTTPException(status_code=502, detail=error_detail) from exc
    finally:
        # Record the outcome regardless of success, re-raise, or wrap.
        _record_account_task(task_ctx, status, error_detail)
def _app_module() -> Any:
    """Return the live module object for this module via `sys.modules`."""
    return sys.modules[__name__]
@app.on_event("startup")
@ -260,108 +154,34 @@ def _startup() -> None:
provisioning.start()
scheduler.add_task("schedule.mailu_sync", settings.mailu_sync_cron, lambda: mailu.sync("ariadne_schedule"))
scheduler.add_task(
"schedule.nextcloud_sync",
settings.nextcloud_sync_cron,
lambda: nextcloud.sync_mail(wait=False),
)
scheduler.add_task(
"schedule.nextcloud_cron",
settings.nextcloud_cron,
lambda: nextcloud.run_cron(),
)
scheduler.add_task(
"schedule.nextcloud_maintenance",
settings.nextcloud_maintenance_cron,
lambda: nextcloud.run_maintenance(),
)
scheduler.add_task("schedule.nextcloud_sync", settings.nextcloud_sync_cron, lambda: nextcloud.sync_mail(wait=False))
scheduler.add_task("schedule.nextcloud_cron", settings.nextcloud_cron, lambda: nextcloud.run_cron())
scheduler.add_task("schedule.nextcloud_maintenance", settings.nextcloud_maintenance_cron, lambda: nextcloud.run_maintenance())
scheduler.add_task("schedule.vaultwarden_sync", settings.vaultwarden_sync_cron, run_vaultwarden_sync)
scheduler.add_task(
"schedule.keycloak_profile",
settings.keycloak_profile_cron,
run_profile_sync,
)
scheduler.add_task(
"schedule.wger_user_sync",
settings.wger_user_sync_cron,
lambda: wger.sync_users(),
)
scheduler.add_task("schedule.keycloak_profile", settings.keycloak_profile_cron, run_profile_sync)
scheduler.add_task("schedule.wger_user_sync", settings.wger_user_sync_cron, lambda: wger.sync_users())
scheduler.add_task("schedule.wger_admin", settings.wger_admin_cron, lambda: wger.ensure_admin(wait=False))
scheduler.add_task(
"schedule.firefly_user_sync",
settings.firefly_user_sync_cron,
lambda: firefly.sync_users(),
)
scheduler.add_task(
"schedule.firefly_cron",
settings.firefly_cron,
lambda: firefly.run_cron(),
)
scheduler.add_task(
"schedule.pod_cleaner",
settings.pod_cleaner_cron,
clean_finished_pods,
)
scheduler.add_task(
"schedule.opensearch_prune",
settings.opensearch_prune_cron,
prune_indices,
)
scheduler.add_task(
"schedule.image_sweeper",
settings.image_sweeper_cron,
lambda: image_sweeper.run(wait=True),
)
scheduler.add_task(
"schedule.metis_sentinel_watch",
settings.metis_sentinel_watch_cron,
lambda: metis.watch_sentinel(),
)
scheduler.add_task(
"schedule.metis_k3s_token_sync",
settings.metis_k3s_token_sync_cron,
lambda: metis_token_sync.run(wait=True),
)
scheduler.add_task("schedule.firefly_user_sync", settings.firefly_user_sync_cron, lambda: firefly.sync_users())
scheduler.add_task("schedule.firefly_cron", settings.firefly_cron, lambda: firefly.run_cron())
scheduler.add_task("schedule.pod_cleaner", settings.pod_cleaner_cron, clean_finished_pods)
scheduler.add_task("schedule.opensearch_prune", settings.opensearch_prune_cron, prune_indices)
scheduler.add_task("schedule.image_sweeper", settings.image_sweeper_cron, lambda: image_sweeper.run(wait=True))
scheduler.add_task("schedule.metis_sentinel_watch", settings.metis_sentinel_watch_cron, lambda: metis.watch_sentinel())
scheduler.add_task("schedule.metis_k3s_token_sync", settings.metis_k3s_token_sync_cron, lambda: metis_token_sync.run(wait=True))
scheduler.add_task(
"schedule.platform_quality_suite_probe",
settings.platform_quality_suite_probe_cron,
lambda: platform_quality_probe.run(wait=True),
)
scheduler.add_task(
"schedule.vault_k8s_auth",
settings.vault_k8s_auth_cron,
lambda: vault.sync_k8s_auth(wait=True),
)
scheduler.add_task(
"schedule.vault_oidc",
settings.vault_oidc_cron,
lambda: vault.sync_oidc(wait=True),
)
scheduler.add_task(
"schedule.comms_guest_name",
settings.comms_guest_name_cron,
lambda: comms.run_guest_name_randomizer(wait=True),
)
scheduler.add_task(
"schedule.comms_pin_invite",
settings.comms_pin_invite_cron,
lambda: comms.run_pin_invite(wait=True),
)
scheduler.add_task(
"schedule.comms_reset_room",
settings.comms_reset_room_cron,
lambda: comms.run_reset_room(wait=True),
)
scheduler.add_task(
"schedule.comms_seed_room",
settings.comms_seed_room_cron,
lambda: comms.run_seed_room(wait=True),
)
scheduler.add_task(
"schedule.cluster_state",
settings.cluster_state_cron,
lambda: run_cluster_state(storage),
)
scheduler.add_task("schedule.jenkins_build_weather", settings.jenkins_build_weather_cron, collect_jenkins_build_weather)
scheduler.add_task("schedule.jenkins_workspace_cleanup", settings.jenkins_workspace_cleanup_cron, cleanup_jenkins_workspace_storage)
scheduler.add_task("schedule.vault_k8s_auth", settings.vault_k8s_auth_cron, lambda: vault.sync_k8s_auth(wait=True))
scheduler.add_task("schedule.vault_oidc", settings.vault_oidc_cron, lambda: vault.sync_oidc(wait=True))
scheduler.add_task("schedule.comms_guest_name", settings.comms_guest_name_cron, lambda: comms.run_guest_name_randomizer(wait=True))
scheduler.add_task("schedule.comms_pin_invite", settings.comms_pin_invite_cron, lambda: comms.run_pin_invite(wait=True))
scheduler.add_task("schedule.comms_reset_room", settings.comms_reset_room_cron, lambda: comms.run_reset_room(wait=True))
scheduler.add_task("schedule.comms_seed_room", settings.comms_seed_room_cron, lambda: comms.run_seed_room(wait=True))
scheduler.add_task("schedule.cluster_state", settings.cluster_state_cron, lambda: run_cluster_state(storage))
scheduler.start()
logger.info(
"ariadne started",
@@ -382,6 +202,11 @@ def _startup() -> None:
"metis_sentinel_watch_cron": settings.metis_sentinel_watch_cron,
"metis_k3s_token_sync_cron": settings.metis_k3s_token_sync_cron,
"platform_quality_suite_probe_cron": settings.platform_quality_suite_probe_cron,
"jenkins_build_weather_cron": settings.jenkins_build_weather_cron,
"jenkins_base_url": settings.jenkins_base_url,
"jenkins_workspace_cleanup_cron": settings.jenkins_workspace_cleanup_cron,
"jenkins_workspace_cleanup_dry_run": settings.jenkins_workspace_cleanup_dry_run,
"jenkins_workspace_cleanup_max_deletions_per_run": settings.jenkins_workspace_cleanup_max_deletions_per_run,
"vault_k8s_auth_cron": settings.vault_k8s_auth_cron,
"vault_oidc_cron": settings.vault_oidc_cron,
"comms_guest_name_cron": settings.comms_guest_name_cron,
@@ -405,591 +230,26 @@ def _shutdown() -> None:
@app.get("/health")
def health() -> dict[str, Any]:
    """Report liveness with a constant payload.

    Touches no backing service, so probes and operators can confirm the
    process is up even when dependencies are degraded.
    """
    return {"ok": True}
@app.get(settings.metrics_path)
def metrics() -> Response:
    """Serve the Prometheus exposition payload for Ariadne runtime tasks."""
    body = generate_latest()
    return Response(body, media_type=CONTENT_TYPE_LATEST)
@app.get("/api/admin/access/requests")
def list_access_requests(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
    """Return all pending access requests for admin review."""
    _require_admin(ctx)
    logger.info(
        "list access requests",
        extra={"event": "access_requests_list", "actor": ctx.username or ""},
    )
    try:
        rows = storage.list_pending_requests()
    except Exception:
        # Surface storage failures as a gateway error rather than a 500.
        raise HTTPException(status_code=502, detail="failed to load requests")

    def _serialize(row: dict[str, Any]) -> dict[str, Any]:
        # Serialize one storage row; non-datetime created_at becomes "".
        created = row.get("created_at")
        return {
            "id": row.get("request_code"),
            "username": row.get("username"),
            "email": row.get("contact_email") or "",
            "first_name": row.get("first_name") or "",
            "last_name": row.get("last_name") or "",
            "request_code": row.get("request_code"),
            "created_at": created.isoformat() if isinstance(created, datetime) else "",
            "note": row.get("note") or "",
        }

    return JSONResponse({"requests": [_serialize(row) for row in rows]})
@app.get("/api/admin/access/flags")
def list_access_flags(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
    """List flag groups admins may grant, preferring live Keycloak groups."""
    _require_admin(ctx)
    # Fall back to the static configured list when Keycloak is not ready
    # or the group lookup fails.
    if not keycloak_admin.ready():
        return JSONResponse({"flags": settings.allowed_flag_groups})
    try:
        flags = keycloak_admin.list_group_names(exclude={"admin"})
    except Exception:
        flags = settings.allowed_flag_groups
    return JSONResponse({"flags": flags})
@app.get("/api/admin/audit/events")
def list_audit_events(
    limit: int = 200,
    event_type: str | None = None,
    ctx: AuthContext = Depends(_require_auth),
) -> JSONResponse:
    """Return recent audit events, optionally filtered by event type."""
    _require_admin(ctx)
    try:
        rows = storage.list_events(limit=limit, event_type=event_type)
    except Exception:
        raise HTTPException(status_code=502, detail="failed to load audit events")
    events: list[dict[str, Any]] = []
    for row in rows:
        ts = row.get("created_at")
        events.append(
            {
                "id": row.get("id"),
                "event_type": row.get("event_type"),
                "detail": _parse_event_detail(row.get("detail")),
                # Non-datetime timestamps serialize as an empty string.
                "created_at": ts.isoformat() if isinstance(ts, datetime) else "",
            }
        )
    return JSONResponse({"events": events})
@app.get("/api/admin/audit/task-runs")
def list_audit_task_runs(
    limit: int = 200,
    request_code: str | None = None,
    task: str | None = None,
    ctx: AuthContext = Depends(_require_auth),
) -> JSONResponse:
    """Return recorded background task runs for the admin audit view."""
    _require_admin(ctx)
    try:
        rows = storage.list_task_runs(limit=limit, request_code=request_code, task=task)
    except Exception:
        raise HTTPException(status_code=502, detail="failed to load task runs")

    def _serialize(row: dict[str, Any]) -> dict[str, Any]:
        # Missing/non-datetime timestamps serialize as empty strings.
        begun = row.get("started_at")
        ended = row.get("finished_at")
        return {
            "id": row.get("id"),
            "request_code": row.get("request_code") or "",
            "task": row.get("task") or "",
            "status": row.get("status") or "",
            "detail": _parse_event_detail(row.get("detail")),
            "started_at": begun.isoformat() if isinstance(begun, datetime) else "",
            "finished_at": ended.isoformat() if isinstance(ended, datetime) else "",
            "duration_ms": row.get("duration_ms"),
        }

    return JSONResponse({"task_runs": [_serialize(row) for row in rows]})
@app.get("/api/admin/cluster/state")
def get_cluster_state(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
    """Serve the most recent cluster-state snapshot to administrators."""
    _require_admin(ctx)
    snapshot = storage.latest_cluster_state()
    if snapshot:
        return JSONResponse(snapshot)
    # No snapshot recorded yet (or storage returned nothing).
    raise HTTPException(status_code=404, detail="cluster state unavailable")
@app.get("/api/internal/cluster/state")
def get_cluster_state_internal() -> JSONResponse:
    """Serve the latest cluster-state snapshot on the internal (unauthenticated) path.

    NOTE(review): no auth dependency here — presumably restricted at the
    network layer; confirm before exposing this route publicly.
    """
    snapshot = storage.latest_cluster_state()
    if snapshot:
        return JSONResponse(snapshot)
    raise HTTPException(status_code=404, detail="cluster state unavailable")
@app.post("/api/admin/access/requests/{username}/approve")
async def approve_access_request(
username: str,
request: Request,
ctx: AuthContext = Depends(_require_auth),
) -> JSONResponse:
"""Approve a pending, email-verified access request and start provisioning.

The approval is a single conditional UPDATE: only a row that is still
'pending' and has a non-null email_verified_at transitions to 'approved'.
If no row matches, the call is a no-op and is audited as "skipped".
On success, provisioning runs on a daemon thread keyed by request_code.
"""
_require_admin(ctx)
with task_context("admin.access.approve"):
payload = await _read_json_payload(request)
# Only flags from the allow-list survive; unknown flags are dropped silently.
allowed_flags = _allowed_flag_groups()
flags = [flag for flag in _flags_from_payload(payload) if flag in allowed_flags]
note = _note_from_payload(payload)
decided_by = ctx.username or ""
try:
row = portal_db.fetchone(
"""
UPDATE access_requests
SET status = 'approved',
decided_at = NOW(),
decided_by = %s,
approval_flags = %s,
approval_note = %s
WHERE username = %s
AND status = 'pending'
AND email_verified_at IS NOT NULL
RETURNING request_code
""",
(decided_by or None, flags or None, note, username),
)
except Exception:
raise HTTPException(status_code=502, detail="failed to approve request")
if not row:
# Request was not pending/verified (or does not exist): record the
# skip for audit but still answer 200 so the UI treats it as settled.
logger.info(
"access request approval ignored",
extra={"event": "access_request_approve", "actor": decided_by, "username": username, "status": "skipped"},
)
_record_event(
"access_request_approve",
{
"actor": decided_by,
"username": username,
"status": "skipped",
},
)
return JSONResponse({"ok": True, "request_code": ""})
request_code = row.get("request_code") or ""
if request_code:
# Fire-and-forget: provisioning runs in the background; the HTTP
# response does not wait for account creation to finish.
threading.Thread(
target=provisioning.provision_access_request,
args=(request_code,),
daemon=True,
).start()
logger.info(
"access request approved",
extra={
"event": "access_request_approve",
"actor": decided_by,
"username": username,
"request_code": request_code,
},
)
_record_event(
"access_request_approve",
{
"actor": decided_by,
"username": username,
"request_code": request_code,
"status": "ok",
"flags": flags,
"note": note or "",
},
)
return JSONResponse({"ok": True, "request_code": request_code})
@app.post("/api/admin/access/requests/{username}/deny")
async def deny_access_request(
username: str,
request: Request,
ctx: AuthContext = Depends(_require_auth),
) -> JSONResponse:
"""Deny a pending access request, recording the admin's denial note.

A conditional UPDATE moves the row from 'pending' to 'denied'; if no
pending row exists the call is a no-op audited as "skipped". Unlike
approval, denial does not require email verification.
"""
_require_admin(ctx)
with task_context("admin.access.deny"):
payload = await _read_json_payload(request)
note = _note_from_payload(payload)
decided_by = ctx.username or ""
try:
row = portal_db.fetchone(
"""
UPDATE access_requests
SET status = 'denied',
decided_at = NOW(),
decided_by = %s,
denial_note = %s
WHERE username = %s AND status = 'pending'
RETURNING request_code
""",
(decided_by or None, note, username),
)
except Exception:
raise HTTPException(status_code=502, detail="failed to deny request")
if not row:
# Nothing pending for this username: audit the skip, still return 200.
logger.info(
"access request denial ignored",
extra={"event": "access_request_deny", "actor": decided_by, "username": username, "status": "skipped"},
)
_record_event(
"access_request_deny",
{
"actor": decided_by,
"username": username,
"status": "skipped",
},
)
return JSONResponse({"ok": True, "request_code": ""})
logger.info(
"access request denied",
extra={
"event": "access_request_deny",
"actor": decided_by,
"username": username,
"request_code": row.get("request_code") or "",
},
)
_record_event(
"access_request_deny",
{
"actor": decided_by,
"username": username,
"request_code": row.get("request_code") or "",
"status": "ok",
"note": note or "",
},
)
return JSONResponse({"ok": True, "request_code": row.get("request_code")})
@app.post("/api/access/requests/{request_code}/retry")
def retry_access_request(request_code: str) -> JSONResponse:
"""Re-run provisioning for a stuck request.

Only requests in 'accounts_building' or 'approved' state are retryable.
The retry clears provision_attempted_at, flips errored sub-tasks back to
'pending', then relaunches provisioning on a daemon thread.

NOTE(review): this route has no auth dependency — presumably invoked by
a trusted caller; confirm exposure before widening access.
"""
code = (request_code or "").strip()
if not code:
raise HTTPException(status_code=400, detail="request_code is required")
if not keycloak_admin.ready():
raise HTTPException(status_code=503, detail="server not configured")
try:
row = portal_db.fetchone(
"SELECT status FROM access_requests WHERE request_code = %s",
(code,),
)
except Exception:
raise HTTPException(status_code=502, detail="failed to load request")
if not row:
raise HTTPException(status_code=404, detail="not found")
status = (row.get("status") or "").strip()
if status not in {"accounts_building", "approved"}:
raise HTTPException(status_code=409, detail="request not retryable")
try:
portal_db.execute(
"UPDATE access_requests SET provision_attempted_at = NULL WHERE request_code = %s",
(code,),
)
# Only tasks that previously errored are reset; completed ones keep
# their status so provisioning does not redo finished work.
portal_db.execute(
"""
UPDATE access_request_tasks
SET status = 'pending',
detail = 'retry requested',
updated_at = NOW()
WHERE request_code = %s AND status = 'error'
""",
(code,),
)
except Exception:
raise HTTPException(status_code=502, detail="failed to update retry state")
threading.Thread(
target=provisioning.provision_access_request,
args=(code,),
daemon=True,
).start()
_record_event(
"access_request_retry",
{
"request_code": code,
"status": "ok",
},
)
return JSONResponse({"ok": True, "request_code": code})
@app.post("/api/account/mailu/rotate")
def rotate_mailu_password(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
"""Rotate the caller's Mailu app password and trigger dependent syncs.

Generates a fresh password, stores it as the Keycloak attribute
'mailu_app_password', then best-effort syncs Mailu and Nextcloud mail.
Sync failures are reported in the response body (sync_error /
nextcloud_sync) rather than failing the rotation itself. The finally
block always records task-run metrics, a storage row, and an audit event.
"""
_require_account_access(ctx)
if not keycloak_admin.ready():
raise HTTPException(status_code=503, detail="server not configured")
username = ctx.username or ""
if not username:
raise HTTPException(status_code=400, detail="missing username")
with task_context("account.mailu_rotate"):
started = datetime.now(timezone.utc)
status = "ok"
error_detail = ""
sync_enabled = mailu.ready()
sync_ok = False
sync_error = ""
# Default reflects the outcome when the Nextcloud sync is never reached.
nextcloud_sync: dict[str, Any] = {"status": "skipped"}
logger.info(
"mailu password rotate requested",
extra={"event": "mailu_rotate", "username": username},
)
try:
password = random_password()
keycloak_admin.set_user_attribute(username, "mailu_app_password", password)
if sync_enabled:
# Mailu sync is best-effort: a failure is surfaced in sync_error
# but does not abort the rotation.
try:
mailu.sync("ariadne_mailu_rotate")
sync_ok = True
except Exception as exc:
sync_error = safe_error_detail(exc, "sync request failed")
# Nextcloud sync is likewise best-effort.
try:
nextcloud_sync = nextcloud.sync_mail(username, wait=True)
except Exception as exc:
nextcloud_sync = {"status": "error", "detail": safe_error_detail(exc, "failed to sync nextcloud")}
logger.info(
"mailu password rotate completed",
extra={
"event": "mailu_rotate",
"username": username,
"sync_enabled": sync_enabled,
"sync_ok": sync_ok,
"nextcloud_status": nextcloud_sync.get("status") if isinstance(nextcloud_sync, dict) else "",
},
)
return JSONResponse(
{
"password": password,
"sync_enabled": sync_enabled,
"sync_ok": sync_ok,
"sync_error": sync_error,
"nextcloud_sync": nextcloud_sync,
}
)
except HTTPException as exc:
status = "error"
error_detail = str(exc.detail)
raise
except Exception as exc:
status = "error"
error_detail = safe_error_detail(exc, "mailu rotate failed")
raise HTTPException(status_code=502, detail=error_detail)
finally:
# Audit path runs on both success and failure.
finished = datetime.now(timezone.utc)
duration_sec = (finished - started).total_seconds()
record_task_run("mailu_rotate", status, duration_sec)
try:
storage.record_task_run(
TaskRunRecord(
request_code=None,
task="mailu_rotate",
status=status,
detail=error_detail or None,
started_at=started,
finished_at=finished,
duration_ms=int(duration_sec * 1000),
)
)
except Exception:
# Metrics persistence must never mask the real outcome.
pass
_record_event(
"mailu_rotate",
{
"username": username,
"status": status,
"sync_enabled": sync_enabled,
"sync_ok": sync_ok,
"nextcloud_status": nextcloud_sync.get("status") if isinstance(nextcloud_sync, dict) else "",
"error": error_detail,
},
)
@app.post("/api/account/wger/reset")
def reset_wger_password(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
    """Rotate the caller's wger password and push it to the wger service."""
    _require_account_access(ctx)
    if not keycloak_admin.ready():
        raise HTTPException(status_code=503, detail="server not configured")
    username = ctx.username or ""
    if not username:
        raise HTTPException(status_code=400, detail="missing username")
    with task_context("account.wger_reset"):
        email = _resolve_mailu_email(username)
        new_password = random_password()
        reset = PasswordResetRequest(
            task_name="wger_reset",
            service_label="wger",
            username=username,
            mailu_email=email,
            password=new_password,
            sync_fn=lambda: wger.sync_user(username, email, new_password, wait=True),
            password_attr="wger_password",
            updated_attr="wger_password_updated_at",
            error_hint="wger sync failed",
        )
        return _run_password_reset(reset)
@app.post("/api/account/firefly/reset")
def reset_firefly_password(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
    """Rotate the caller's Firefly password and push it to the Firefly service."""
    _require_account_access(ctx)
    if not keycloak_admin.ready():
        raise HTTPException(status_code=503, detail="server not configured")
    username = ctx.username or ""
    if not username:
        raise HTTPException(status_code=400, detail="missing username")
    with task_context("account.firefly_reset"):
        email = _resolve_mailu_email(username)
        # Firefly uses a longer 24-character password than the default.
        new_password = random_password(24)
        reset = PasswordResetRequest(
            task_name="firefly_reset",
            service_label="firefly",
            username=username,
            mailu_email=email,
            password=new_password,
            sync_fn=lambda: firefly.sync_user(email, new_password, wait=True),
            password_attr="firefly_password",
            updated_attr="firefly_password_updated_at",
            error_hint="firefly sync failed",
        )
        return _run_password_reset(reset)
@app.post("/api/account/firefly/rotation/check")
def firefly_rotation_check(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
    """Check the caller's Firefly password rotation state."""
    _require_account_access(ctx)
    if not keycloak_admin.ready():
        raise HTTPException(status_code=503, detail="server not configured")
    username = ctx.username or ""
    if not username:
        raise HTTPException(status_code=400, detail="missing username")
    with task_context("account.firefly_rotation_check"):
        outcome = firefly.check_rotation_for_user(username)
        if outcome.get("status") == "error":
            raise HTTPException(status_code=502, detail=outcome.get("detail") or "firefly rotation check failed")
        return JSONResponse(outcome)
@app.post("/api/account/wger/rotation/check")
def wger_rotation_check(ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
    """Check the caller's wger password rotation state."""
    _require_account_access(ctx)
    if not keycloak_admin.ready():
        raise HTTPException(status_code=503, detail="server not configured")
    username = ctx.username or ""
    if not username:
        raise HTTPException(status_code=400, detail="missing username")
    with task_context("account.wger_rotation_check"):
        outcome = wger.check_rotation_for_user(username)
        if outcome.get("status") == "error":
            raise HTTPException(status_code=502, detail=outcome.get("detail") or "wger rotation check failed")
        return JSONResponse(outcome)
@app.post("/api/account/nextcloud/mail/sync")
async def nextcloud_mail_sync(request: Request, ctx: AuthContext = Depends(_require_auth)) -> JSONResponse:
"""Synchronize the caller's Mailu address into Nextcloud mail settings.

Accepts an optional JSON body {"wait": bool} (defaults to true; malformed
bodies are treated as empty). Delegates to nextcloud.sync_mail and returns
its result unchanged. The finally block always records task-run metrics,
a storage row, and an audit event.
"""
_require_account_access(ctx)
if not keycloak_admin.ready():
raise HTTPException(status_code=503, detail="server not configured")
username = ctx.username or ""
if not username:
raise HTTPException(status_code=400, detail="missing username")
with task_context("account.nextcloud_sync"):
# A body that is not valid JSON is silently treated as {}.
try:
payload = await request.json()
except Exception:
payload = {}
wait = bool(payload.get("wait", True)) if isinstance(payload, dict) else True
started = datetime.now(timezone.utc)
status = "ok"
error_detail = ""
logger.info(
"nextcloud mail sync requested",
extra={"event": "nextcloud_sync", "username": username, "wait": wait},
)
try:
result = nextcloud.sync_mail(username, wait=wait)
logger.info(
"nextcloud mail sync completed",
extra={
"event": "nextcloud_sync",
"username": username,
"status": result.get("status") if isinstance(result, dict) else "",
},
)
return JSONResponse(result)
except HTTPException as exc:
status = "error"
error_detail = str(exc.detail)
raise
except Exception as exc:
status = "error"
error_detail = safe_error_detail(exc, "failed to sync nextcloud mail")
logger.info(
"nextcloud mail sync failed",
extra={"event": "nextcloud_sync", "username": username, "error": error_detail},
)
raise HTTPException(status_code=502, detail=error_detail)
finally:
# Audit path runs on both success and failure.
finished = datetime.now(timezone.utc)
duration_sec = (finished - started).total_seconds()
record_task_run("nextcloud_sync", status, duration_sec)
try:
storage.record_task_run(
TaskRunRecord(
request_code=None,
task="nextcloud_sync",
status=status,
detail=error_detail or None,
started_at=started,
finished_at=finished,
duration_ms=int(duration_sec * 1000),
)
)
except Exception:
# Metrics persistence must never mask the real outcome.
pass
_record_event(
"nextcloud_sync",
{
"username": username,
"status": status,
"wait": wait,
"error": error_detail,
},
)
@app.post("/events")
def mailu_event_listener(payload: dict[str, Any] | None = Body(default=None)) -> Response:
    """Accept Mailu webhook events and dispatch mapped account actions."""
    code, body = mailu_events.handle_event(payload)
    return JSONResponse(body, status_code=code)
# Mount the admin and account route groups that were extracted into sibling
# modules; both receive the auth dependency and a module accessor for
# late-bound lookups of settings/clients.
_register_admin_routes(app, _require_auth, _app_module)
_register_account_routes(app, _require_auth, _app_module)

View File

@@ -0,0 +1,356 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Callable
from fastapi import Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from .auth.keycloak import AuthContext
from .db.storage import TaskRunRecord
from .utils.errors import safe_error_detail
from .utils.logging import task_context
@dataclass(frozen=True)
class AccountTaskContext:
"""Immutable metadata describing one account task execution for auditing."""
task_name: str  # task identifier used for metrics, storage rows, and events
username: str  # acting account's username
started: datetime  # UTC start time; duration is derived from it
extra: dict[str, Any] | None = None  # optional extra keys merged into the audit event detail
@dataclass(frozen=True)
class PasswordResetRequest:
"""Immutable parameters for one service password reset run by _run_password_reset."""
task_name: str  # audit/task-run identifier, e.g. "wger_reset"
service_label: str  # human-readable service name used in log messages
username: str  # Keycloak username whose password is reset
mailu_email: str  # resolved Mailu address passed to the sync function
password: str  # newly generated password to store and sync
sync_fn: Callable[[], dict[str, Any]]  # pushes the password to the service; must return {"status": ...}
password_attr: str  # Keycloak attribute that stores the password
updated_attr: str  # Keycloak attribute that stores the rotation timestamp
error_hint: str  # fallback detail used when the sync raises
def _resolve_mailu_email(module: Any, username: str) -> str:
mailu_email = f"{username}@{module.settings.mailu_domain}"
try:
user = module.keycloak_admin.find_user(username) or {}
attrs = user.get("attributes") if isinstance(user, dict) else None
if isinstance(attrs, dict):
raw_mailu = attrs.get("mailu_email")
if isinstance(raw_mailu, list) and raw_mailu:
return str(raw_mailu[0])
if isinstance(raw_mailu, str) and raw_mailu:
return raw_mailu
except Exception:
return mailu_email
return mailu_email
def _record_account_task(module: Any, ctx: AccountTaskContext, status: str, error_detail: str) -> None:
"""Record metrics, a task-run row, and an audit event for one account task.

Duration is measured from ctx.started to now (UTC). Persisting the
task-run row is best-effort; a storage failure is swallowed so it never
masks the task's own outcome. The audit event always fires.
"""
finished = datetime.now(timezone.utc)
duration_sec = (finished - ctx.started).total_seconds()
module.record_task_run(ctx.task_name, status, duration_sec)
try:
module.storage.record_task_run(
TaskRunRecord(
request_code=None,
task=ctx.task_name,
status=status,
detail=error_detail or None,
started_at=ctx.started,
finished_at=finished,
duration_ms=int(duration_sec * 1000),
)
)
except Exception:
# Best-effort persistence: never fail the caller over audit storage.
pass
detail = {"username": ctx.username, "status": status, "error": error_detail}
if ctx.extra:
# Caller-supplied keys (e.g. mailu_email) augment the base detail.
detail.update(ctx.extra)
module._record_event(ctx.task_name, detail)
def _run_password_reset(module: Any, request: PasswordResetRequest) -> JSONResponse:
"""Execute a service password reset: sync first, then persist to Keycloak.

The service sync runs before the Keycloak attributes are updated, so a
failed sync leaves the stored password unchanged. A sync result whose
status is not "ok" is escalated to an HTTP 502. The finally block always
records metrics and an audit event via _record_account_task.
"""
started = datetime.now(timezone.utc)
task_ctx = AccountTaskContext(
task_name=request.task_name,
username=request.username,
started=started,
extra={"mailu_email": request.mailu_email},
)
status = "ok"
error_detail = ""
module.logger.info(
f"{request.service_label} password reset requested",
extra={"event": request.task_name, "username": request.username},
)
try:
result = request.sync_fn()
status_val = result.get("status") if isinstance(result, dict) else "error"
if status_val != "ok":
# Treat a non-ok sync as a failure; the except below maps it to 502.
raise RuntimeError(f"{request.service_label} sync {status_val}")
# Only after a successful sync: store the password and rotation stamp.
module.keycloak_admin.set_user_attribute(request.username, request.password_attr, request.password)
module.keycloak_admin.set_user_attribute(
request.username,
request.updated_attr,
datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
)
module.logger.info(
f"{request.service_label} password reset completed",
extra={"event": request.task_name, "username": request.username},
)
return JSONResponse({"status": "ok", "password": request.password})
except HTTPException as exc:
status = "error"
error_detail = str(exc.detail)
raise
except Exception as exc:
status = "error"
error_detail = safe_error_detail(exc, request.error_hint)
raise HTTPException(status_code=502, detail=error_detail)
finally:
# Audit path runs on both success and failure.
_record_account_task(module, task_ctx, status, error_detail)
def _register_account_routes(app: FastAPI, require_auth: Callable, deps: Callable[[], Any]) -> None: # noqa: PLR0915
"""Register the /api/account/* routes on *app*.

Each handler resolves its collaborators lazily via ``deps()`` at request
time, so the routes can be registered before the owning module finishes
initializing. ``require_auth`` supplies the shared auth dependency.
"""
@app.post("/api/account/mailu/rotate")
def rotate_mailu_password(ctx: AuthContext = Depends(require_auth)) -> JSONResponse:
"""Rotate the caller's Mailu app password and trigger dependent syncs."""
module = deps()
module._require_account_access(ctx)
if not module.keycloak_admin.ready():
raise HTTPException(status_code=503, detail="server not configured")
username = ctx.username or ""
if not username:
raise HTTPException(status_code=400, detail="missing username")
with task_context("account.mailu_rotate"):
started = datetime.now(timezone.utc)
status = "ok"
error_detail = ""
sync_enabled = module.mailu.ready()
sync_ok = False
sync_error = ""
# Default reflects the outcome when the Nextcloud sync is never reached.
nextcloud_sync: dict[str, Any] = {"status": "skipped"}
module.logger.info("mailu password rotate requested", extra={"event": "mailu_rotate", "username": username})
try:
password = module.random_password()
module.keycloak_admin.set_user_attribute(username, "mailu_app_password", password)
if sync_enabled:
# Mailu and Nextcloud syncs are best-effort; failures are
# reported in the response rather than aborting the rotation.
try:
module.mailu.sync("ariadne_mailu_rotate")
sync_ok = True
except Exception as exc:
sync_error = safe_error_detail(exc, "sync request failed")
try:
nextcloud_sync = module.nextcloud.sync_mail(username, wait=True)
except Exception as exc:
nextcloud_sync = {"status": "error", "detail": safe_error_detail(exc, "failed to sync nextcloud")}
module.logger.info(
"mailu password rotate completed",
extra={
"event": "mailu_rotate",
"username": username,
"sync_enabled": sync_enabled,
"sync_ok": sync_ok,
"nextcloud_status": nextcloud_sync.get("status") if isinstance(nextcloud_sync, dict) else "",
},
)
return JSONResponse(
{
"password": password,
"sync_enabled": sync_enabled,
"sync_ok": sync_ok,
"sync_error": sync_error,
"nextcloud_sync": nextcloud_sync,
}
)
except HTTPException as exc:
status = "error"
error_detail = str(exc.detail)
raise
except Exception as exc:
status = "error"
error_detail = safe_error_detail(exc, "mailu rotate failed")
raise HTTPException(status_code=502, detail=error_detail)
finally:
# Audit path runs on both success and failure.
task_ctx = AccountTaskContext("mailu_rotate", username, started)
_record_account_task(module, task_ctx, status, error_detail)
# Extra event carrying the sync flags not covered by the generic record.
module._record_event(
"mailu_rotate",
{
"username": username,
"status": status,
"sync_enabled": sync_enabled,
"sync_ok": sync_ok,
"nextcloud_status": nextcloud_sync.get("status") if isinstance(nextcloud_sync, dict) else "",
"error": error_detail,
},
)
@app.post("/api/account/wger/reset")
def reset_wger_password(ctx: AuthContext = Depends(require_auth)) -> JSONResponse:
"""Reset the caller's Wger password and synchronize the service account."""
module = deps()
module._require_account_access(ctx)
if not module.keycloak_admin.ready():
raise HTTPException(status_code=503, detail="server not configured")
username = ctx.username or ""
if not username:
raise HTTPException(status_code=400, detail="missing username")
with task_context("account.wger_reset"):
mailu_email = _resolve_mailu_email(module, username)
password = module.random_password()
request = PasswordResetRequest(
task_name="wger_reset",
service_label="wger",
username=username,
mailu_email=mailu_email,
password=password,
sync_fn=lambda: module.wger.sync_user(username, mailu_email, password, wait=True),
password_attr="wger_password",
updated_attr="wger_password_updated_at",
error_hint="wger sync failed",
)
return _run_password_reset(module, request)
@app.post("/api/account/firefly/reset")
def reset_firefly_password(ctx: AuthContext = Depends(require_auth)) -> JSONResponse:
"""Reset the caller's Firefly password and synchronize the service account."""
module = deps()
module._require_account_access(ctx)
if not module.keycloak_admin.ready():
raise HTTPException(status_code=503, detail="server not configured")
username = ctx.username or ""
if not username:
raise HTTPException(status_code=400, detail="missing username")
with task_context("account.firefly_reset"):
mailu_email = _resolve_mailu_email(module, username)
# Firefly uses a longer 24-character password than the default.
password = module.random_password(24)
request = PasswordResetRequest(
task_name="firefly_reset",
service_label="firefly",
username=username,
mailu_email=mailu_email,
password=password,
sync_fn=lambda: module.firefly.sync_user(mailu_email, password, wait=True),
password_attr="firefly_password",
updated_attr="firefly_password_updated_at",
error_hint="firefly sync failed",
)
return _run_password_reset(module, request)
@app.post("/api/account/firefly/rotation/check")
def firefly_rotation_check(ctx: AuthContext = Depends(require_auth)) -> JSONResponse:
"""Check whether the caller's Firefly password rotation is healthy."""
module = deps()
module._require_account_access(ctx)
if not module.keycloak_admin.ready():
raise HTTPException(status_code=503, detail="server not configured")
username = ctx.username or ""
if not username:
raise HTTPException(status_code=400, detail="missing username")
with task_context("account.firefly_rotation_check"):
result = module.firefly.check_rotation_for_user(username)
if result.get("status") == "error":
raise HTTPException(status_code=502, detail=result.get("detail") or "firefly rotation check failed")
return JSONResponse(result)
@app.post("/api/account/wger/rotation/check")
def wger_rotation_check(ctx: AuthContext = Depends(require_auth)) -> JSONResponse:
"""Check whether the caller's Wger password rotation is healthy."""
module = deps()
module._require_account_access(ctx)
if not module.keycloak_admin.ready():
raise HTTPException(status_code=503, detail="server not configured")
username = ctx.username or ""
if not username:
raise HTTPException(status_code=400, detail="missing username")
with task_context("account.wger_rotation_check"):
result = module.wger.check_rotation_for_user(username)
if result.get("status") == "error":
raise HTTPException(status_code=502, detail=result.get("detail") or "wger rotation check failed")
return JSONResponse(result)
@app.post("/api/account/nextcloud/mail/sync")
async def nextcloud_mail_sync(request: Request, ctx: AuthContext = Depends(require_auth)) -> JSONResponse:
"""Synchronize the caller's Mailu address into Nextcloud mail settings."""
module = deps()
module._require_account_access(ctx)
if not module.keycloak_admin.ready():
raise HTTPException(status_code=503, detail="server not configured")
username = ctx.username or ""
if not username:
raise HTTPException(status_code=400, detail="missing username")
with task_context("account.nextcloud_sync"):
# A body that is not valid JSON is silently treated as {}.
try:
payload = await request.json()
except Exception:
payload = {}
wait = bool(payload.get("wait", True)) if isinstance(payload, dict) else True
started = datetime.now(timezone.utc)
status = "ok"
error_detail = ""
module.logger.info("nextcloud mail sync requested", extra={"event": "nextcloud_sync", "username": username, "wait": wait})
try:
result = module.nextcloud.sync_mail(username, wait=wait)
module.logger.info(
"nextcloud mail sync completed",
extra={
"event": "nextcloud_sync",
"username": username,
"status": result.get("status") if isinstance(result, dict) else "",
},
)
return JSONResponse(result)
except HTTPException as exc:
status = "error"
error_detail = str(exc.detail)
raise
except Exception as exc:
status = "error"
error_detail = safe_error_detail(exc, "failed to sync nextcloud mail")
module.logger.info(
"nextcloud mail sync failed",
extra={"event": "nextcloud_sync", "username": username, "error": error_detail},
)
raise HTTPException(status_code=502, detail=error_detail)
finally:
# Audit path runs on both success and failure.
task_ctx = AccountTaskContext("nextcloud_sync", username, started)
_record_account_task(module, task_ctx, status, error_detail)
module._record_event(
"nextcloud_sync",
{
"username": username,
"status": status,
"wait": wait,
"error": error_detail,
},
)

346
ariadne/app_admin_routes.py Normal file
View File

@@ -0,0 +1,346 @@
from __future__ import annotations
from datetime import datetime
import threading
from typing import Any, Callable
from fastapi import Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from .auth.keycloak import AuthContext
from .utils.logging import task_context
def _register_admin_routes(app: FastAPI, require_auth: Callable, deps: Callable[[], Any]) -> None: # noqa: PLR0915
@app.get("/api/admin/access/requests")
def list_access_requests(ctx: AuthContext = Depends(require_auth)) -> JSONResponse:
"""Return pending access requests for authenticated administrators."""
module = deps()
module._require_admin(ctx)
module.logger.info(
"list access requests",
extra={"event": "access_requests_list", "actor": ctx.username or ""},
)
try:
rows = module.storage.list_pending_requests()
except Exception:
raise HTTPException(status_code=502, detail="failed to load requests")
output: list[dict[str, Any]] = []
for row in rows:
created_at = row.get("created_at")
output.append(
{
"id": row.get("request_code"),
"username": row.get("username"),
"email": row.get("contact_email") or "",
"first_name": row.get("first_name") or "",
"last_name": row.get("last_name") or "",
"request_code": row.get("request_code"),
"created_at": created_at.isoformat() if isinstance(created_at, datetime) else "",
"note": row.get("note") or "",
}
)
return JSONResponse({"requests": output})
@app.get("/api/admin/access/flags")
def list_access_flags(ctx: AuthContext = Depends(require_auth)) -> JSONResponse:
"""Return Keycloak groups that can be applied as access-request flags."""
module = deps()
module._require_admin(ctx)
flags = module.settings.allowed_flag_groups
if module.keycloak_admin.ready():
try:
flags = module.keycloak_admin.list_group_names(exclude={"admin"})
except Exception:
flags = module.settings.allowed_flag_groups
return JSONResponse({"flags": flags})
@app.get("/api/admin/audit/events")
def list_audit_events(
limit: int = 200,
event_type: str | None = None,
ctx: AuthContext = Depends(require_auth),
) -> JSONResponse:
"""Return recent audit events with optional type filtering."""
module = deps()
module._require_admin(ctx)
try:
rows = module.storage.list_events(limit=limit, event_type=event_type)
except Exception:
raise HTTPException(status_code=502, detail="failed to load audit events")
output: list[dict[str, Any]] = []
for row in rows:
created_at = row.get("created_at")
output.append(
{
"id": row.get("id"),
"event_type": row.get("event_type"),
"detail": module._parse_event_detail(row.get("detail")),
"created_at": created_at.isoformat() if isinstance(created_at, datetime) else "",
}
)
return JSONResponse({"events": output})
@app.get("/api/admin/audit/task-runs")
def list_audit_task_runs(
limit: int = 200,
request_code: str | None = None,
task: str | None = None,
ctx: AuthContext = Depends(require_auth),
) -> JSONResponse:
"""Return recorded background task runs for admin audit views."""
module = deps()
module._require_admin(ctx)
try:
rows = module.storage.list_task_runs(limit=limit, request_code=request_code, task=task)
except Exception:
raise HTTPException(status_code=502, detail="failed to load task runs")
output: list[dict[str, Any]] = []
for row in rows:
started_at = row.get("started_at")
finished_at = row.get("finished_at")
output.append(
{
"id": row.get("id"),
"request_code": row.get("request_code") or "",
"task": row.get("task") or "",
"status": row.get("status") or "",
"detail": module._parse_event_detail(row.get("detail")),
"started_at": started_at.isoformat() if isinstance(started_at, datetime) else "",
"finished_at": finished_at.isoformat() if isinstance(finished_at, datetime) else "",
"duration_ms": row.get("duration_ms"),
}
)
return JSONResponse({"task_runs": output})
@app.get("/api/admin/cluster/state")
def get_cluster_state(ctx: AuthContext = Depends(require_auth)) -> JSONResponse:
    """Return the latest cluster-state snapshot to authenticated administrators."""
    module = deps()
    module._require_admin(ctx)
    latest = module.storage.latest_cluster_state()
    if latest:
        return JSONResponse(latest)
    # No snapshot persisted yet (or an empty one): report it as missing.
    raise HTTPException(status_code=404, detail="cluster state unavailable")
@app.get("/api/internal/cluster/state")
def get_cluster_state_internal() -> JSONResponse:
    """Return the latest cluster-state snapshot for trusted internal callers.

    NOTE(review): no auth dependency on this route — presumably it is only
    reachable from inside the cluster; confirm the network policy enforces that.
    """
    module = deps()
    latest = module.storage.latest_cluster_state()
    if latest:
        return JSONResponse(latest)
    raise HTTPException(status_code=404, detail="cluster state unavailable")
@app.post("/api/admin/access/requests/{username}/approve")
async def approve_access_request(
    username: str,
    request: Request,
    ctx: AuthContext = Depends(require_auth),
) -> JSONResponse:
    """Approve a verified access request and start account provisioning.

    Only a pending request with a verified email is approved; anything else
    is a no-op that is still logged and audited with status "skipped". On
    success a daemon thread kicks off provisioning for the request code.

    Raises:
        HTTPException: 502 when the portal database update fails.
    """
    module = deps()
    module._require_admin(ctx)
    with task_context("admin.access.approve"):
        payload = await module._read_json_payload(request)
        allowed_flags = module._allowed_flag_groups()
        # Drop any requested flag that is not explicitly allow-listed.
        flags = [flag for flag in module._flags_from_payload(payload) if flag in allowed_flags]
        note = module._note_from_payload(payload)
        decided_by = ctx.username or ""
        try:
            row = module.portal_db.fetchone(
                """
                UPDATE access_requests
                SET status = 'approved',
                    decided_at = NOW(),
                    decided_by = %s,
                    approval_flags = %s,
                    approval_note = %s
                WHERE username = %s
                  AND status = 'pending'
                  AND email_verified_at IS NOT NULL
                RETURNING request_code
                """,
                (decided_by or None, flags or None, note, username),
            )
        except Exception as exc:
            # Chain the database failure so the root cause is not lost.
            raise HTTPException(status_code=502, detail="failed to approve request") from exc
        if not row:
            # No matching pending+verified request: audit the no-op and return.
            module.logger.info(
                "access request approval ignored",
                extra={"event": "access_request_approve", "actor": decided_by, "username": username, "status": "skipped"},
            )
            module._record_event(
                "access_request_approve",
                {
                    "actor": decided_by,
                    "username": username,
                    "status": "skipped",
                },
            )
            return JSONResponse({"ok": True, "request_code": ""})
        request_code = row.get("request_code") or ""
        if request_code:
            # Provision in the background so the admin request returns quickly.
            threading.Thread(
                target=module.provisioning.provision_access_request,
                args=(request_code,),
                daemon=True,
            ).start()
        module.logger.info(
            "access request approved",
            extra={
                "event": "access_request_approve",
                "actor": decided_by,
                "username": username,
                "request_code": request_code,
            },
        )
        module._record_event(
            "access_request_approve",
            {
                "actor": decided_by,
                "username": username,
                "request_code": request_code,
                "status": "ok",
                "flags": flags,
                "note": note or "",
            },
        )
        return JSONResponse({"ok": True, "request_code": request_code})
@app.post("/api/admin/access/requests/{username}/deny")
async def deny_access_request(
    username: str,
    request: Request,
    ctx: AuthContext = Depends(require_auth),
) -> JSONResponse:
    """Deny a pending access request and record the administrator decision.

    A request that is not pending is left untouched; the attempt is still
    logged and audited with status "skipped".

    Raises:
        HTTPException: 502 when the portal database update fails.
    """
    module = deps()
    module._require_admin(ctx)
    with task_context("admin.access.deny"):
        payload = await module._read_json_payload(request)
        note = module._note_from_payload(payload)
        decided_by = ctx.username or ""
        try:
            row = module.portal_db.fetchone(
                """
                UPDATE access_requests
                SET status = 'denied',
                    decided_at = NOW(),
                    decided_by = %s,
                    denial_note = %s
                WHERE username = %s AND status = 'pending'
                RETURNING request_code
                """,
                (decided_by or None, note, username),
            )
        except Exception as exc:
            # Chain the database failure so the root cause is not lost.
            raise HTTPException(status_code=502, detail="failed to deny request") from exc
        if not row:
            module.logger.info(
                "access request denial ignored",
                extra={"event": "access_request_deny", "actor": decided_by, "username": username, "status": "skipped"},
            )
            module._record_event(
                "access_request_deny",
                {
                    "actor": decided_by,
                    "username": username,
                    "status": "skipped",
                },
            )
            return JSONResponse({"ok": True, "request_code": ""})
        module.logger.info(
            "access request denied",
            extra={
                "event": "access_request_deny",
                "actor": decided_by,
                "username": username,
                "request_code": row.get("request_code") or "",
            },
        )
        module._record_event(
            "access_request_deny",
            {
                "actor": decided_by,
                "username": username,
                "request_code": row.get("request_code") or "",
                "status": "ok",
                "note": note or "",
            },
        )
        return JSONResponse({"ok": True, "request_code": row.get("request_code")})
@app.post("/api/access/requests/{request_code}/retry")
def retry_access_request(request_code: str) -> JSONResponse:
    """Reset failed provisioning tasks so an approved request can retry.

    Raises:
        HTTPException: 400 for an empty code, 503 when Keycloak admin is not
            configured, 404 for an unknown code, 409 when the request is not
            in a retryable status, 502 on database failures.
    """
    module = deps()
    code = (request_code or "").strip()
    if not code:
        raise HTTPException(status_code=400, detail="request_code is required")
    if not module.keycloak_admin.ready():
        raise HTTPException(status_code=503, detail="server not configured")
    try:
        row = module.portal_db.fetchone(
            "SELECT status FROM access_requests WHERE request_code = %s",
            (code,),
        )
    except Exception as exc:
        # Chain so the underlying database error is preserved as the cause.
        raise HTTPException(status_code=502, detail="failed to load request") from exc
    if not row:
        raise HTTPException(status_code=404, detail="not found")
    status = (row.get("status") or "").strip()
    if status not in {"accounts_building", "approved"}:
        raise HTTPException(status_code=409, detail="request not retryable")
    try:
        # Clear the attempt timestamp and flip errored tasks back to pending.
        module.portal_db.execute(
            "UPDATE access_requests SET provision_attempted_at = NULL WHERE request_code = %s",
            (code,),
        )
        module.portal_db.execute(
            """
            UPDATE access_request_tasks
            SET status = 'pending',
                detail = 'retry requested',
                updated_at = NOW()
            WHERE request_code = %s AND status = 'error'
            """,
            (code,),
        )
    except Exception as exc:
        raise HTTPException(status_code=502, detail="failed to update retry state") from exc
    # Re-run provisioning in the background; the HTTP response returns at once.
    threading.Thread(
        target=module.provisioning.provision_access_request,
        args=(code,),
        daemon=True,
    ).start()
    module._record_event(
        "access_request_retry",
        {
            "request_code": code,
            "status": "ok",
        },
    )
    return JSONResponse({"ok": True, "request_code": code})

View File

@ -19,6 +19,8 @@ class AuthContext:
class KeycloakOIDC:
"""Validate Keycloak-issued OIDC tokens and return trusted claims."""
def __init__(self, jwks_url: str, issuer: str, client_id: str) -> None:
self._jwks_url = jwks_url
self._issuer = issuer
@ -55,12 +57,18 @@ class KeycloakOIDC:
def _decode_claims(self, token: str, key: dict[str, Any]) -> dict[str, Any]:
return jwt.decode(
token,
key=jwt.algorithms.RSAAlgorithm.from_jwk(key),
key=self._key_from_jwk(key),
algorithms=["RS256"],
options={"verify_aud": False},
issuer=self._issuer,
)
def _key_from_jwk(self, key: dict[str, Any]) -> Any:
    """Build a signature-verification key object from a JWK dict.

    Prefers the ``jwt.algorithms.RSAAlgorithm.from_jwk`` entry point when it
    exists; otherwise falls back to ``jwt.PyJWK.from_dict`` — presumably to
    cover multiple PyJWT releases; confirm against the pinned version.
    """
    algorithm = getattr(jwt.algorithms, "RSAAlgorithm", None)
    if algorithm and hasattr(algorithm, "from_jwk"):
        return algorithm.from_jwk(key)
    return jwt.PyJWK.from_dict(key).key
def _validate_audience(self, claims: dict[str, Any]) -> None:
azp = claims.get("azp")
aud = claims.get("aud")
@ -97,6 +105,8 @@ class KeycloakOIDC:
class Authenticator:
"""Translate bearer tokens into Ariadne authorization context."""
def __init__(self) -> None:
self._oidc = KeycloakOIDC(settings.keycloak_jwks_url, settings.keycloak_issuer, settings.keycloak_client_id)

View File

@ -25,6 +25,8 @@ class DatabaseConfig:
class Database:
"""Small Postgres wrapper with migration and query helpers."""
def __init__(self, dsn: str, config: DatabaseConfig | None = None) -> None:
if not dsn:
raise RuntimeError("database URL is required")
@ -92,13 +94,7 @@ class Database:
except Exception:
pass
def migrate(
self,
lock_id: int,
*,
include_ariadne_tables: bool = True,
include_access_requests: bool = True,
) -> None:
def migrate(self, lock_id: int, *, include_ariadne_tables: bool = True, include_access_requests: bool = True) -> None:
with self.connection() as conn:
self._configure_timeouts(conn)
if not self._try_advisory_lock(conn, lock_id):

View File

@ -62,6 +62,8 @@ class ScheduleState:
class Storage:
"""Persist Ariadne access requests, task state, and audit data."""
def __init__(self, db: Database, portal_db: Database | None = None) -> None:
self._db = db
self._portal_db = portal_db or db
@ -262,6 +264,36 @@ class Storage:
),
)
def list_schedule_states(self) -> list[ScheduleState]:
    """Return persisted scheduler state so metrics survive process restarts."""
    rows = self._db.fetchall(
        """
        SELECT task_name, cron_expr, last_started_at, last_finished_at, last_status,
        last_error, last_duration_ms, next_run_at
        FROM ariadne_schedule_state
        """
    )
    states: list[ScheduleState] = []
    for row in rows:
        task_name = row.get("task_name")
        cron_expr = row.get("cron_expr")
        # Skip malformed rows rather than raising: both keys must be strings.
        if not isinstance(task_name, str) or not isinstance(cron_expr, str):
            continue
        states.append(
            ScheduleState(
                task_name=task_name,
                cron_expr=cron_expr,
                last_started_at=row.get("last_started_at"),
                last_finished_at=row.get("last_finished_at"),
                last_status=row.get("last_status"),
                last_error=row.get("last_error"),
                last_duration_ms=row.get("last_duration_ms"),
                next_run_at=row.get("next_run_at"),
            )
        )
    return states
def record_cluster_state(self, snapshot: dict[str, Any]) -> None:
payload = json.dumps(snapshot, ensure_ascii=True)
self._db.execute(

View File

@ -35,6 +35,8 @@ def _k8s_request(method: str, path: str, payload: dict[str, Any] | None = None)
def get_json(path: str) -> dict[str, Any]:
"""Fetch a Kubernetes API path and return its JSON object payload."""
payload = _k8s_request("GET", path)
if not isinstance(payload, dict):
raise RuntimeError("unexpected kubernetes response")
@ -42,6 +44,8 @@ def get_json(path: str) -> dict[str, Any]:
def post_json(path: str, payload: dict[str, Any]) -> dict[str, Any]:
"""Post a JSON payload to the Kubernetes API and return the response."""
data = _k8s_request("POST", path, payload)
if not isinstance(data, dict):
raise RuntimeError("unexpected kubernetes response")
@ -49,6 +53,8 @@ def post_json(path: str, payload: dict[str, Any]) -> dict[str, Any]:
def delete_json(path: str) -> dict[str, Any]:
"""Delete a Kubernetes API resource and return the response payload."""
data = _k8s_request("DELETE", path)
if not isinstance(data, dict):
raise RuntimeError("unexpected kubernetes response")
@ -56,6 +62,8 @@ def delete_json(path: str) -> dict[str, Any]:
def get_secret_value(namespace: str, name: str, key: str) -> str:
"""Read and decode one string value from a Kubernetes Secret."""
data = get_json(f"/api/v1/namespaces/{namespace}/secrets/{name}")
blob = data.get("data") if isinstance(data.get("data"), dict) else {}
raw = blob.get(key)

View File

@ -9,10 +9,7 @@ try:
from kubernetes import client, config
from kubernetes.stream import stream
except Exception as exc: # pragma: no cover - import checked at runtime
client = None
config = None
stream = None
_IMPORT_ERROR = exc
client, config, stream, _IMPORT_ERROR = None, None, None, exc
else:
_IMPORT_ERROR = None
@ -65,18 +62,14 @@ def _build_command(command: list[str] | str, env: dict[str, str] | None) -> list
class PodExecutor:
"""Run shell commands inside the freshest ready pod matching a selector."""
def __init__(self, namespace: str, label_selector: str, container: str | None = None) -> None:
self._namespace = namespace
self._label_selector = label_selector
self._container = container
def exec(
self,
command: list[str] | str,
env: dict[str, str] | None = None,
timeout_sec: float | None = None,
check: bool = True,
) -> ExecResult:
def exec(self, command: list[str] | str, env: dict[str, str] | None = None, timeout_sec: float | None = None, check: bool = True) -> ExecResult:
pod = select_pod(self._namespace, self._label_selector)
cmd = _build_command(command, env)
api = _ensure_client()

View File

@ -47,6 +47,8 @@ def _is_ready(pod: dict[str, Any]) -> bool:
def list_pods(namespace: str, label_selector: str) -> list[dict[str, Any]]:
"""List Kubernetes pods for a namespace and label selector."""
namespace = (namespace or "").strip()
if not namespace:
raise PodSelectionError("pod namespace missing")
@ -58,6 +60,8 @@ def list_pods(namespace: str, label_selector: str) -> list[dict[str, Any]]:
def select_pod(namespace: str, label_selector: str) -> PodRef:
"""Select the newest ready pod matching a namespace and label selector."""
pods = list_pods(namespace, label_selector)
candidates: list[tuple[float, PodRef]] = []
for pod in pods:

View File

@ -1,15 +1,11 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
import hashlib
import re
from datetime import datetime
import threading
import time
from typing import Any
from ..db.database import Database
from ..db.storage import REQUIRED_TASKS, Storage, TaskRunRecord
from ..db.storage import REQUIRED_TASKS, Storage
from ..metrics.metrics import record_task_run, set_access_request_counts
from ..services.firefly import firefly
from ..services.keycloak_admin import keycloak_admin
@ -19,87 +15,80 @@ from ..services.nextcloud import nextcloud
from ..services.vaultwarden import vaultwarden
from ..services.wger import wger
from ..settings import settings
from ..utils.errors import safe_error_detail
from ..utils.logging import get_logger
from ..utils.passwords import random_password
MAILU_EMAIL_ATTR = "mailu_email"
MAILU_APP_PASSWORD_ATTR = "mailu_app_password"
MAILU_ENABLED_ATTR = "mailu_enabled"
WGER_PASSWORD_ATTR = "wger_password"
WGER_PASSWORD_UPDATED_ATTR = "wger_password_updated_at"
FIREFLY_PASSWORD_ATTR = "firefly_password"
FIREFLY_PASSWORD_UPDATED_ATTR = "firefly_password_updated_at"
VAULTWARDEN_GRANDFATHERED_FLAG = "vaultwarden_grandfathered"
_RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
_RETRYABLE_TOKENS = (
"timeout",
"temporar",
"rate limited",
"mailbox not ready",
"connection refused",
"connection reset",
"network is unreachable",
"dns",
"name resolution",
"service unavailable",
"bad gateway",
"gateway timeout",
from .provisioning_accounts import _ProvisioningAccountsMixin
from .provisioning_protocol import (
FIREFLY_PASSWORD_ATTR,
FIREFLY_PASSWORD_UPDATED_ATTR,
MAILU_APP_PASSWORD_ATTR,
MAILU_EMAIL_ATTR,
MAILU_ENABLED_ATTR,
VAULTWARDEN_GRANDFATHERED_FLAG,
WGER_PASSWORD_ATTR,
WGER_PASSWORD_UPDATED_ATTR,
ProvisionOutcome,
RequestContext,
_advisory_lock_id,
_extract_attr,
)
from .provisioning_tasks import _ProvisioningTaskMixin
logger = get_logger(__name__)
@dataclass(frozen=True)
class ProvisionOutcome:
    """Immutable result of one provisioning pass for an access request."""

    ok: bool  # overall success flag for the pass
    status: str  # resulting request status (e.g. "accounts_building", "unknown")
class ProvisioningManager(_ProvisioningTaskMixin, _ProvisioningAccountsMixin):
"""Coordinate approved access requests across identity and app services."""
@dataclass
class RequestContext:
    """Mutable snapshot of one access request while it is being provisioned."""

    request_code: str
    username: str
    first_name: str
    last_name: str
    contact_email: str
    email_verified_at: datetime | None
    status: str
    initial_password: str | None
    revealed_at: datetime | None
    attempted_at: datetime | None
    approval_flags: list[str]
    # Filled in during provisioning once the Keycloak user is known.
    user_id: str = ""
    mailu_email: str = ""
def _advisory_lock_id(request_code: str) -> int:
    """Derive a stable signed 64-bit advisory-lock id from a request code."""
    digest = hashlib.sha256(request_code.encode("utf-8")).digest()
    # First 8 digest bytes, signed, so the value fits a bigint lock key.
    return int.from_bytes(digest[:8], "big", signed=True)
def _extract_attr(attrs: Any, key: str) -> str:
    """Return the first non-empty stripped string for *key* from an attrs dict.

    Accepts both list-of-strings and plain-string values (Keycloak-style
    attribute maps); any other shape yields "".
    """
    if not isinstance(attrs, dict):
        return ""
    raw = attrs.get(key)
    if isinstance(raw, list):
        for item in raw:
            if isinstance(item, str) and item.strip():
                return item.strip()
        return ""
    if isinstance(raw, str) and raw.strip():
        return raw.strip()
    return ""
class ProvisioningManager:
def __init__(self, db: Database, storage: Storage) -> None:
    """Initialize database handles and background-thread bookkeeping."""
    self._db = db
    self._storage = storage
    # Worker thread state; the stop event signals shutdown.
    self._thread: threading.Thread | None = None
    self._stop_event = threading.Event()

# The properties below indirect module-level singletons through the
# instance — presumably so tests can override them; confirm with the suite.
@property
def _settings(self):
    return settings

@property
def _logger(self):
    return logger

@property
def _keycloak_admin(self):
    return keycloak_admin

@property
def _mailu(self):
    return mailu

@property
def _nextcloud(self):
    return nextcloud

@property
def _wger(self):
    return wger

@property
def _firefly(self):
    return firefly

@property
def _vaultwarden(self):
    return vaultwarden

@property
def _mailer(self):
    return mailer

def _random_password(self, length: int = 32) -> str:
    """Delegate to the shared random_password helper (default 32 chars)."""
    return random_password(length)

def _record_task_run_metric(self, task: str, status: str, duration_sec: float) -> None:
    """Forward one task-run observation to the metrics module."""
    record_task_run(task, status, duration_sec)
def start(self) -> None:
if self._thread and self._thread.is_alive():
return
@ -207,12 +196,7 @@ class ProvisioningManager:
extra={"event": "provision_unlock_error", "request_code": request_code},
)
def _provision_locked(
self,
conn,
request_code: str,
required_tasks: list[str],
) -> ProvisionOutcome:
def _provision_locked(self, conn, request_code: str, required_tasks: list[str]) -> ProvisionOutcome:
ctx = self._load_request(conn, request_code)
if not ctx:
return ProvisionOutcome(ok=False, status="unknown")
@ -227,12 +211,7 @@ class ProvisioningManager:
return self._run_task_pipeline(conn, ctx, required_tasks)
def _run_task_pipeline(
self,
conn,
ctx: RequestContext,
required_tasks: list[str],
) -> ProvisionOutcome:
def _run_task_pipeline(self, conn, ctx: RequestContext, required_tasks: list[str]) -> ProvisionOutcome:
if not self._ensure_keycloak_user(conn, ctx):
return ProvisionOutcome(ok=False, status="accounts_building")
if not self._run_account_tasks(conn, ctx):
@ -353,581 +332,19 @@ class ProvisioningManager:
pass
return ProvisionOutcome(ok=False, status=pending_status)
def _ensure_task_rows(self, conn, request_code: str, tasks: list[str]) -> None:
    """Insert pending task rows for *request_code*, skipping ones that exist."""
    if not tasks:
        return
    conn.execute(
        """
        INSERT INTO access_request_tasks (request_code, task, status, detail, updated_at)
        SELECT %s, task, 'pending', NULL, NOW()
        FROM UNNEST(%s::text[]) AS task
        ON CONFLICT (request_code, task) DO NOTHING
        """,
        (request_code, tasks),
    )

def _upsert_task(self, conn, request_code: str, task: str, status: str, detail: str | None = None) -> None:
    """Insert or update one task row, stamping updated_at on every write."""
    conn.execute(
        """
        INSERT INTO access_request_tasks (request_code, task, status, detail, updated_at)
        VALUES (%s, %s, %s, %s, NOW())
        ON CONFLICT (request_code, task)
        DO UPDATE SET status = EXCLUDED.status, detail = EXCLUDED.detail, updated_at = NOW()
        """,
        (request_code, task, status, detail),
    )

def _task_statuses(self, conn, request_code: str) -> dict[str, str]:
    """Map task name -> status for one request, skipping malformed rows."""
    rows = conn.execute(
        "SELECT task, status FROM access_request_tasks WHERE request_code = %s",
        (request_code,),
    ).fetchall()
    output: dict[str, str] = {}
    for row in rows:
        # Rows are expected as dicts; anything else is ignored defensively.
        task = row.get("task") if isinstance(row, dict) else None
        status = row.get("status") if isinstance(row, dict) else None
        if isinstance(task, str) and isinstance(status, str):
            output[task] = status
    return output

def _all_tasks_ok(self, conn, request_code: str, tasks: list[str]) -> bool:
    """Return True when every task in *tasks* reports status 'ok'."""
    statuses = self._task_statuses(conn, request_code)
    for task in tasks:
        if statuses.get(task) != "ok":
            return False
    return True
def _record_task(self, request_code: str, task: str, status: str, detail: str | None, started: datetime) -> None:
    """Emit the metric, log line, and best-effort audit rows for one task run."""
    finished = datetime.now(timezone.utc)
    duration_sec = (finished - started).total_seconds()
    record_task_run(task, status, duration_sec)
    logger.info(
        "task run",
        extra={
            "event": "task_run",
            "request_code": request_code,
            "task": task,
            "status": status,
            "duration_sec": round(duration_sec, 3),
            "detail": detail or "",
        },
    )
    # Audit persistence is best-effort: a storage outage must not fail the task.
    try:
        self._storage.record_event(
            "provision_task",
            {
                "request_code": request_code,
                "task": task,
                "status": status,
                "duration_sec": round(duration_sec, 3),
                "detail": detail or "",
            },
        )
    except Exception:
        pass
    try:
        self._storage.record_task_run(
            TaskRunRecord(
                request_code=request_code,
                task=task,
                status=status,
                detail=detail,
                started_at=started,
                finished_at=finished,
                duration_ms=int(duration_sec * 1000),
            )
        )
    except Exception:
        pass

def _task_ok(
    self,
    conn,
    request_code: str,
    task: str,
    detail: str | None,
    started: datetime,
) -> None:
    """Persist a task as succeeded and record the run."""
    self._upsert_task(conn, request_code, task, "ok", detail)
    self._record_task(request_code, task, "ok", detail, started)

def _task_error(
    self,
    conn,
    request_code: str,
    task: str,
    detail: str,
    started: datetime,
) -> None:
    """Persist a task as a hard error and record the run."""
    self._upsert_task(conn, request_code, task, "error", detail)
    self._record_task(request_code, task, "error", detail, started)

def _task_pending(
    self,
    conn,
    request_code: str,
    task: str,
    detail: str,
    started: datetime,
) -> None:
    """Persist a task as still pending (blocked/retryable) and record the run."""
    self._upsert_task(conn, request_code, task, "pending", detail)
    self._record_task(request_code, task, "pending", detail, started)

def _is_retryable_detail(self, detail: str) -> bool:
    """Heuristically classify an error detail string as transient."""
    if not detail:
        return False
    detail_lower = detail.lower()
    # "HTTP 503"-style prefixes: retry on known transient status codes.
    match = re.match(r"^http\s+(\d{3})", detail_lower)
    if match:
        try:
            code = int(match.group(1))
        except ValueError:
            code = 0
        if code in _RETRYABLE_HTTP_CODES:
            return True
    # Otherwise look for transient-failure keywords anywhere in the text.
    return any(token in detail_lower for token in _RETRYABLE_TOKENS)

def _retryable_detail(self, detail: str) -> str:
    """Prefix a detail string so operators can see the task will retry."""
    cleaned = detail.strip() if isinstance(detail, str) else ""
    if not cleaned:
        return "retryable: temporary failure"
    return f"retryable: {cleaned}"

def _task_fail(
    self,
    conn,
    request_code: str,
    task: str,
    detail: str,
    started: datetime,
) -> None:
    """Route a failure to pending (blocked or retryable) or hard error."""
    detail_lower = detail.lower()
    if "missing verified email address" in detail_lower or "email not verified" in detail_lower:
        # Waiting on the user, not on infrastructure: keep the task pending.
        self._task_pending(conn, request_code, task, "blocked: email not verified", started)
        return
    if self._is_retryable_detail(detail):
        self._task_pending(conn, request_code, task, self._retryable_detail(detail), started)
        return
    self._task_error(conn, request_code, task, detail, started)
def _vaultwarden_rate_limit_detail(self) -> tuple[str, datetime]:
    """Build the 'rate limited until <ts>' detail plus its retry deadline."""
    retry_at = datetime.now(timezone.utc) + timedelta(
        seconds=float(settings.vaultwarden_admin_rate_limit_backoff_sec)
    )
    retry_iso = retry_at.strftime("%Y-%m-%dT%H:%M:%SZ")
    return f"rate limited until {retry_iso}", retry_at

@staticmethod
def _parse_retry_at(detail: str) -> datetime | None:
    """Parse the deadline from a 'rate limited until <ts>' detail string.

    Returns None for any other format; naive timestamps are treated as UTC.
    """
    prefix = "rate limited until "
    if not isinstance(detail, str) or not detail.startswith(prefix):
        return None
    ts = detail[len(prefix) :].strip()
    for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"):
        try:
            parsed = datetime.strptime(ts, fmt)
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed
        except ValueError:
            continue
    return None

def _vaultwarden_retry_due(self, conn, request_code: str) -> bool:
    """Return True when a pending vaultwarden invite may be retried now.

    Missing or malformed rows default to True so bad data never blocks retry.
    """
    row = conn.execute(
        """
        SELECT status, detail
        FROM access_request_tasks
        WHERE request_code = %s AND task = 'vaultwarden_invite'
        """,
        (request_code,),
    ).fetchone()
    if not isinstance(row, dict):
        return True
    if row.get("status") != "pending":
        return True
    retry_at = self._parse_retry_at(row.get("detail") or "")
    if not retry_at:
        return True
    return datetime.now(timezone.utc) >= retry_at

@staticmethod
def _set_vaultwarden_attrs(username: str, email: str, status: str) -> None:
    """Best-effort mirror of vaultwarden sync state onto Keycloak attributes."""
    if not username or not email or not status:
        return
    try:
        now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        keycloak_admin.set_user_attribute(username, "vaultwarden_email", email)
        keycloak_admin.set_user_attribute(username, "vaultwarden_status", status)
        keycloak_admin.set_user_attribute(username, "vaultwarden_synced_at", now_iso)
    except Exception:
        # Attribute mirroring must never fail provisioning.
        return

def _ready_for_retry(self, ctx: RequestContext) -> bool:
    """Return True once the provision retry cooldown for *ctx* has elapsed."""
    if ctx.status != "accounts_building":
        return True
    attempted_at = ctx.attempted_at
    if not isinstance(attempted_at, datetime):
        return True
    if attempted_at.tzinfo is None:
        # Naive timestamps are assumed UTC — TODO confirm against the schema.
        attempted_at = attempted_at.replace(tzinfo=timezone.utc)
    age_sec = (datetime.now(timezone.utc) - attempted_at).total_seconds()
    return age_sec >= settings.provision_retry_cooldown_sec
def _require_verified_email(self, ctx: RequestContext) -> str:
    """Return the verified contact email or raise RuntimeError when missing."""
    if not isinstance(ctx.email_verified_at, datetime):
        raise RuntimeError("missing verified email address")
    email = ctx.contact_email.strip()
    if not email:
        raise RuntimeError("missing verified email address")
    return email

def _ensure_email_unused(self, email: str, username: str) -> None:
    """Raise when *email* already belongs to a different Keycloak user."""
    existing_email_user = keycloak_admin.find_user_by_email(email)
    if existing_email_user and (existing_email_user.get("username") or "") != username:
        raise RuntimeError("email is already associated with an existing Atlas account")

def _new_user_payload(
    self,
    username: str,
    email: str,
    mailu_email: str,
    first_name: str,
    last_name: str,
) -> dict[str, Any]:
    """Build the Keycloak user-creation payload for an approved request."""
    payload = {
        "username": username,
        "enabled": True,
        "email": email,
        "emailVerified": True,
        "requiredActions": [],
        "attributes": {
            MAILU_EMAIL_ATTR: [mailu_email],
            MAILU_ENABLED_ATTR: ["true"],
        },
    }
    if first_name:
        payload["firstName"] = first_name
    if last_name:
        payload["lastName"] = last_name
    else:
        # Fall back to the username when no last name was provided.
        payload["lastName"] = username
    return payload
def _create_or_fetch_user(self, ctx: RequestContext) -> dict[str, Any]:
    """Return the Keycloak user for *ctx*, creating it when absent.

    On a failed create, re-checks by username and then by email before
    re-raising — the create may have partially succeeded or raced.
    """
    user = keycloak_admin.find_user(ctx.username)
    if user:
        return user
    email = self._require_verified_email(ctx)
    self._ensure_email_unused(email, ctx.username)
    payload = self._new_user_payload(ctx.username, email, ctx.mailu_email, ctx.first_name, ctx.last_name)
    try:
        created_id = keycloak_admin.create_user(payload)
        return keycloak_admin.get_user(created_id)
    except Exception as exc:
        detail = safe_error_detail(exc, "create user failed")
        logger.warning(
            "keycloak create user failed, checking for existing user",
            extra={"event": "keycloak_user_fallback", "username": ctx.username, "detail": detail},
        )
        user = keycloak_admin.find_user(ctx.username)
        if user:
            return user
        user = keycloak_admin.find_user_by_email(email)
        if user:
            return user
        raise

def _fetch_full_user(self, user_id: str, fallback: dict[str, Any]) -> dict[str, Any]:
    """Fetch the full user record, falling back to the given partial one."""
    try:
        return keycloak_admin.get_user(user_id)
    except Exception:
        return fallback

def _strip_totp_action(self, user_id: str, full_user: dict[str, Any]) -> None:
    """Remove the CONFIGURE_TOTP required action when it is present."""
    actions = full_user.get("requiredActions")
    if not isinstance(actions, list) or "CONFIGURE_TOTP" not in actions:
        return
    new_actions = [action for action in actions if action != "CONFIGURE_TOTP"]
    keycloak_admin.update_user_safe(user_id, {"requiredActions": new_actions})

def _ensure_contact_email(self, ctx: RequestContext, full_user: dict[str, Any]) -> None:
    """Backfill the user's email from the verified request when it is empty."""
    email_value = full_user.get("email")
    if isinstance(email_value, str) and email_value.strip():
        return
    if isinstance(ctx.email_verified_at, datetime) and ctx.contact_email.strip():
        keycloak_admin.update_user_safe(
            ctx.user_id,
            {"email": ctx.contact_email.strip(), "emailVerified": True},
        )

def _ensure_mailu_attrs(self, ctx: RequestContext, full_user: dict[str, Any]) -> None:
    """Ensure mailu email/enabled attributes exist and cache the address on ctx."""
    attrs = full_user.get("attributes") or {}
    if not isinstance(attrs, dict):
        return
    existing = _extract_attr(attrs, MAILU_EMAIL_ATTR)
    if existing:
        ctx.mailu_email = existing
    else:
        ctx.mailu_email = f"{ctx.username}@{settings.mailu_domain}"
        keycloak_admin.set_user_attribute(ctx.username, MAILU_EMAIL_ATTR, ctx.mailu_email)
    enabled_value = _extract_attr(attrs, MAILU_ENABLED_ATTR)
    if enabled_value.lower() not in {"1", "true", "yes", "y", "on"}:
        keycloak_admin.set_user_attribute(ctx.username, MAILU_ENABLED_ATTR, "true")

def _sync_user_profile(self, ctx: RequestContext, user: dict[str, Any]) -> None:
    """Best-effort profile sync; on failure only the mailu email is derived."""
    try:
        full_user = self._fetch_full_user(ctx.user_id, user)
        self._strip_totp_action(ctx.user_id, full_user)
        self._ensure_contact_email(ctx, full_user)
        self._ensure_mailu_attrs(ctx, full_user)
    except Exception:
        ctx.mailu_email = f"{ctx.username}@{settings.mailu_domain}"
def _ensure_keycloak_user(self, conn, ctx: RequestContext) -> bool:
    """Create/fetch the Keycloak user and record the task outcome.

    Returns True on success; subsequent tasks rely on ctx.user_id being set.
    """
    start = datetime.now(timezone.utc)
    try:
        user = self._create_or_fetch_user(ctx)
        ctx.user_id = str((user or {}).get("id") or "")
        if not ctx.user_id:
            raise RuntimeError("user id missing")
        self._sync_user_profile(ctx, user)
        self._task_ok(conn, ctx.request_code, "keycloak_user", None, start)
        return True
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to ensure user")
        self._task_fail(conn, ctx.request_code, "keycloak_user", detail, start)
        return False

def _ensure_keycloak_password(self, conn, ctx: RequestContext) -> None:
    """Set the initial password once, unless it was already revealed.

    NOTE(review): nesting reconstructed from data-flow (the UPDATE guards on
    initial_password IS NULL, matching the elif branch) — confirm upstream.
    """
    start = datetime.now(timezone.utc)
    try:
        # Only reset while building accounts and before the user saw it.
        should_reset = ctx.status == "accounts_building" and ctx.revealed_at is None
        password_value: str | None = None
        if should_reset:
            if isinstance(ctx.initial_password, str) and ctx.initial_password:
                password_value = ctx.initial_password
            elif ctx.initial_password is None:
                password_value = random_password(20)
                conn.execute(
                    """
                    UPDATE access_requests
                    SET initial_password = %s
                    WHERE request_code = %s AND initial_password IS NULL
                    """,
                    (password_value, ctx.request_code),
                )
                ctx.initial_password = password_value
            if password_value:
                keycloak_admin.reset_password(ctx.user_id, password_value, temporary=False)
        if isinstance(ctx.initial_password, str) and ctx.initial_password:
            self._task_ok(conn, ctx.request_code, "keycloak_password", None, start)
        elif ctx.revealed_at is not None:
            detail = "initial password already revealed"
            self._task_ok(conn, ctx.request_code, "keycloak_password", detail, start)
        else:
            raise RuntimeError("initial password missing")
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to set password")
        self._task_fail(conn, ctx.request_code, "keycloak_password", detail, start)

def _ensure_keycloak_groups(self, conn, ctx: RequestContext) -> None:
    """Add the user to default groups plus allow-listed approval flag groups."""
    start = datetime.now(timezone.utc)
    try:
        approved_flags = [flag for flag in ctx.approval_flags if flag in settings.allowed_flag_groups]
        # dict.fromkeys de-duplicates while preserving order.
        groups = list(dict.fromkeys(settings.default_user_groups + approved_flags))
        for group_name in groups:
            gid = keycloak_admin.get_group_id(group_name)
            if not gid:
                raise RuntimeError(f"group missing: {group_name}")
            keycloak_admin.add_user_to_group(ctx.user_id, gid)
        self._task_ok(conn, ctx.request_code, "keycloak_groups", None, start)
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to add groups")
        self._task_fail(conn, ctx.request_code, "keycloak_groups", detail, start)

def _ensure_mailu_app_password(self, conn, ctx: RequestContext) -> None:
    """Generate the mailu app-password attribute when it is not set yet."""
    start = datetime.now(timezone.utc)
    try:
        full = keycloak_admin.get_user(ctx.user_id)
        attrs = full.get("attributes") or {}
        existing = _extract_attr(attrs, MAILU_APP_PASSWORD_ATTR)
        if not existing:
            keycloak_admin.set_user_attribute(ctx.username, MAILU_APP_PASSWORD_ATTR, random_password())
        self._task_ok(conn, ctx.request_code, "mailu_app_password", None, start)
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to set mail password")
        self._task_fail(conn, ctx.request_code, "mailu_app_password", detail, start)
def _sync_mailu(self, conn, ctx: RequestContext) -> bool:
    """Force a mailu sync and wait for the mailbox; True when usable.

    An unconfigured mailu is treated as success so provisioning can proceed.
    """
    start = datetime.now(timezone.utc)
    try:
        if not mailu.ready():
            detail = "mailu not configured"
            self._task_ok(conn, ctx.request_code, "mailu_sync", detail, start)
            return True
        mailu.sync(reason="ariadne_access_approve", force=True)
        mailbox_ready = mailu.wait_for_mailbox(
            ctx.mailu_email,
            settings.mailu_mailbox_wait_timeout_sec,
        )
        if not mailbox_ready:
            raise RuntimeError("mailbox not ready")
        self._task_ok(conn, ctx.request_code, "mailu_sync", None, start)
        return True
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to sync mailu")
        self._task_fail(conn, ctx.request_code, "mailu_sync", detail, start)
        return False

def _sync_nextcloud_mail(self, conn, ctx: RequestContext) -> None:
    """Trigger the Nextcloud mail sync and record its outcome."""
    start = datetime.now(timezone.utc)
    try:
        if not settings.nextcloud_namespace:
            detail = "sync disabled"
            self._task_ok(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
            return
        result = nextcloud.sync_mail(ctx.username, wait=True)
        if isinstance(result, dict) and result.get("status") == "ok":
            self._task_ok(conn, ctx.request_code, "nextcloud_mail_sync", None, start)
            return
        # Failure path: assemble the most specific detail string available.
        status_val = result.get("status") if isinstance(result, dict) else "error"
        summary = result.get("summary") if isinstance(result, dict) else None
        detail = ""
        if summary is not None:
            detail = getattr(summary, "detail", "") or ""
        if not detail and isinstance(result, dict):
            detail = str(result.get("detail") or "")
        detail = detail or str(status_val)
        self._task_fail(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to sync nextcloud")
        self._task_fail(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)

def _ensure_wger_account(self, conn, ctx: RequestContext) -> None:
    """Provision the wger account: generate a password attr, then sync once.

    The *_updated_at attribute acts as an idempotency marker — the sync only
    runs while it is absent, and it is stamped after a successful sync.
    """
    start = datetime.now(timezone.utc)
    try:
        full = keycloak_admin.get_user(ctx.user_id)
        attrs = full.get("attributes") or {}
        wger_password = _extract_attr(attrs, WGER_PASSWORD_ATTR)
        wger_password_updated_at = _extract_attr(attrs, WGER_PASSWORD_UPDATED_ATTR)
        if not wger_password:
            wger_password = random_password(20)
            keycloak_admin.set_user_attribute(ctx.username, WGER_PASSWORD_ATTR, wger_password)
        if not wger_password_updated_at:
            result = wger.sync_user(ctx.username, ctx.mailu_email, wger_password, wait=True)
            status_val = result.get("status") if isinstance(result, dict) else "error"
            if status_val != "ok":
                detail = result.get("detail") if isinstance(result, dict) else ""
                detail = detail or f"wger sync {status_val}"
                raise RuntimeError(detail)
            now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
            keycloak_admin.set_user_attribute(ctx.username, WGER_PASSWORD_UPDATED_ATTR, now_iso)
        self._task_ok(conn, ctx.request_code, "wger_account", None, start)
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to provision wger")
        self._task_fail(conn, ctx.request_code, "wger_account", detail, start)
def _ensure_firefly_account(self, conn, ctx: RequestContext) -> None:
    """Provision the user's Firefly III account, minting credentials on first run."""
    started = datetime.now(timezone.utc)
    try:
        attributes = keycloak_admin.get_user(ctx.user_id).get("attributes") or {}
        password = _extract_attr(attributes, FIREFLY_PASSWORD_ATTR)
        synced_marker = _extract_attr(attributes, FIREFLY_PASSWORD_UPDATED_ATTR)
        if not password:
            # First pass: generate and persist the service password.
            password = random_password(24)
            keycloak_admin.set_user_attribute(ctx.username, FIREFLY_PASSWORD_ATTR, password)
        if not synced_marker:
            # Sync only until the first success; the timestamp attribute is the marker.
            outcome = firefly.sync_user(ctx.mailu_email, password, wait=True)
            outcome_status = outcome.get("status") if isinstance(outcome, dict) else "error"
            if outcome_status != "ok":
                raise RuntimeError(f"firefly sync {outcome_status}")
            stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
            keycloak_admin.set_user_attribute(ctx.username, FIREFLY_PASSWORD_UPDATED_ATTR, stamp)
        self._task_ok(conn, ctx.request_code, "firefly_account", None, started)
    except Exception as exc:
        message = safe_error_detail(exc, "failed to provision firefly")
        self._task_fail(conn, ctx.request_code, "firefly_account", message, started)
def _handle_vaultwarden_grandfathered(self, conn, ctx: RequestContext, start: datetime) -> None:
    """Resolve the invite task for users whose vaultwarden account predates Atlas."""
    lookup = vaultwarden.find_user_by_email(ctx.contact_email)
    if lookup.status == "rate_limited":
        detail, _ = self._vaultwarden_rate_limit_detail()
        self._task_pending(conn, ctx.request_code, "vaultwarden_invite", detail, start)
        self._set_vaultwarden_attrs(ctx.username, ctx.contact_email, "rate_limited")
        return
    if lookup.ok:
        if lookup.status == "present":
            # Account already exists under the recovery email: nothing to invite.
            self._task_ok(conn, ctx.request_code, "vaultwarden_invite", "grandfathered", start)
            self._set_vaultwarden_attrs(ctx.username, ctx.contact_email, "grandfathered")
            return
        if lookup.status == "missing":
            self._task_error(
                conn,
                ctx.request_code,
                "vaultwarden_invite",
                "vaultwarden account not found for recovery email",
                start,
            )
            return
    # Any other lookup result goes through the generic failure classifier.
    self._task_fail(conn, ctx.request_code, "vaultwarden_invite", lookup.detail or lookup.status, start)
def _ensure_vaultwarden_invite(self, conn, ctx: RequestContext) -> None:
    """Invite the user to vaultwarden once their mailbox exists, honoring backoff."""
    start = datetime.now(timezone.utc)
    try:
        if not self._vaultwarden_retry_due(conn, ctx.request_code):
            # Still inside a recorded rate-limit window; try again later.
            return
        if VAULTWARDEN_GRANDFATHERED_FLAG in ctx.approval_flags:
            self._handle_vaultwarden_grandfathered(conn, ctx, start)
            return
        wait_timeout = settings.mailu_mailbox_wait_timeout_sec
        if not mailu.wait_for_mailbox(ctx.mailu_email, wait_timeout):
            # Nudge mailu once (best-effort) and give the mailbox a second chance.
            try:
                mailu.sync(reason="ariadne_vaultwarden_retry", force=True)
            except Exception:
                pass
            if not mailu.wait_for_mailbox(ctx.mailu_email, wait_timeout):
                raise RuntimeError("mailbox not ready")
        result = vaultwarden.invite_user(ctx.mailu_email)
        if result.ok:
            self._task_ok(conn, ctx.request_code, "vaultwarden_invite", result.status, start)
        elif result.status == "rate_limited":
            detail, _ = self._vaultwarden_rate_limit_detail()
            self._task_pending(conn, ctx.request_code, "vaultwarden_invite", detail, start)
        else:
            self._task_error(conn, ctx.request_code, "vaultwarden_invite", result.detail or result.status, start)
        self._set_vaultwarden_attrs(ctx.username, ctx.mailu_email, result.status)
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to provision vaultwarden")
        self._task_fail(conn, ctx.request_code, "vaultwarden_invite", detail, start)
def _send_welcome_email(self, request_code: str, username: str, contact_email: str) -> None:
    """Send the onboarding email at most once per request; mailer errors are swallowed."""
    if not settings.welcome_email_enabled or not contact_email:
        return
    try:
        row = self._db.fetchone(
            "SELECT welcome_email_sent_at FROM access_requests WHERE request_code = %s",
            (request_code,),
        )
        if row and row.get("welcome_email_sent_at"):
            # Already delivered for this request; never resend.
            return
        onboarding_url = f"{settings.portal_public_base_url}/onboarding?code={request_code}"
        mailer.send_welcome(contact_email, request_code, onboarding_url, username=username)
        self._storage.mark_welcome_sent(request_code)
    except MailerError:
        # Mail delivery is best-effort; provisioning must not fail on it.
        return
# Explicit public re-export surface of this module: attribute keys, flags,
# the outcome/context dataclasses, the manager, and the attribute helper.
__all__ = [
    "FIREFLY_PASSWORD_ATTR",
    "FIREFLY_PASSWORD_UPDATED_ATTR",
    "MAILU_APP_PASSWORD_ATTR",
    "MAILU_EMAIL_ATTR",
    "MAILU_ENABLED_ATTR",
    "MailerError",
    "ProvisionOutcome",
    "ProvisioningManager",
    "RequestContext",
    "VAULTWARDEN_GRANDFATHERED_FLAG",
    "WGER_PASSWORD_ATTR",
    "WGER_PASSWORD_UPDATED_ATTR",
    "_extract_attr",
]

View File

@ -0,0 +1,401 @@
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from ..services.mailer import MailerError
from ..utils.errors import safe_error_detail
from .provisioning_protocol import (
FIREFLY_PASSWORD_ATTR,
FIREFLY_PASSWORD_UPDATED_ATTR,
MAILU_APP_PASSWORD_ATTR,
MAILU_EMAIL_ATTR,
MAILU_ENABLED_ATTR,
VAULTWARDEN_GRANDFATHERED_FLAG,
WGER_PASSWORD_ATTR,
WGER_PASSWORD_UPDATED_ATTR,
RequestContext,
_extract_attr,
)
class _ProvisioningAccountsMixin:
def _set_vaultwarden_attrs(self, username: str, email: str, status: str) -> None:
    """Best-effort mirror of vaultwarden invite state onto Keycloak attributes.

    Skips silently when any argument is empty and swallows all Keycloak
    errors: these attributes are informational and must never fail the
    provisioning flow.
    """
    if not username or not email or not status:
        return
    try:
        now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        self._keycloak_admin.set_user_attribute(username, "vaultwarden_email", email)
        self._keycloak_admin.set_user_attribute(username, "vaultwarden_status", status)
        self._keycloak_admin.set_user_attribute(username, "vaultwarden_synced_at", now_iso)
    except Exception:
        return
def _ready_for_retry(self, ctx: RequestContext) -> bool:
if ctx.status != "accounts_building":
return True
attempted_at = ctx.attempted_at
if not isinstance(attempted_at, datetime):
return True
if attempted_at.tzinfo is None:
attempted_at = attempted_at.replace(tzinfo=timezone.utc)
age_sec = (datetime.now(timezone.utc) - attempted_at).total_seconds()
return age_sec >= self._settings.provision_retry_cooldown_sec
def _require_verified_email(self, ctx: RequestContext) -> str:
if not isinstance(ctx.email_verified_at, datetime):
raise RuntimeError("missing verified email address")
email = ctx.contact_email.strip()
if not email:
raise RuntimeError("missing verified email address")
return email
def _ensure_email_unused(self, email: str, username: str) -> None:
existing_email_user = self._keycloak_admin.find_user_by_email(email)
if existing_email_user and (existing_email_user.get("username") or "") != username:
raise RuntimeError("email is already associated with an existing Atlas account")
def _new_user_payload(
    self,
    username: str,
    email: str,
    mailu_email: str,
    first_name: str,
    last_name: str,
) -> dict[str, Any]:
    """Build the Keycloak user-creation body for a freshly approved request.

    The account is created enabled, with the email pre-verified and no
    pending required actions; mailu attributes are seeded up front.
    """
    body: dict[str, Any] = {
        "username": username,
        "enabled": True,
        "email": email,
        "emailVerified": True,
        "requiredActions": [],
        "attributes": {
            MAILU_EMAIL_ATTR: [mailu_email],
            MAILU_ENABLED_ATTR: ["true"],
        },
    }
    if first_name:
        body["firstName"] = first_name
    # A last name is always set; fall back to the username when absent.
    body["lastName"] = last_name if last_name else username
    return body
def _create_or_fetch_user(self, ctx: RequestContext) -> dict[str, Any]:
    """Return the Keycloak user for *ctx*, creating it when absent.

    Creation requires a verified, unclaimed contact email. If the create
    call fails (e.g. another worker won a creation race), the user is
    looked up again by username and then by email before re-raising.
    """
    user = self._keycloak_admin.find_user(ctx.username)
    if user:
        return user
    email = self._require_verified_email(ctx)
    self._ensure_email_unused(email, ctx.username)
    payload = self._new_user_payload(ctx.username, email, ctx.mailu_email, ctx.first_name, ctx.last_name)
    try:
        created_id = self._keycloak_admin.create_user(payload)
        return self._keycloak_admin.get_user(created_id)
    except Exception as exc:
        detail = safe_error_detail(exc, "create user failed")
        self._logger.warning(
            "keycloak create user failed, checking for existing user",
            extra={"event": "keycloak_user_fallback", "username": ctx.username, "detail": detail},
        )
        # Race tolerance: the user may exist even though our create failed.
        user = self._keycloak_admin.find_user(ctx.username)
        if user:
            return user
        user = self._keycloak_admin.find_user_by_email(email)
        if user:
            return user
        raise
def _fetch_full_user(self, user_id: str, fallback: dict[str, Any]) -> dict[str, Any]:
try:
return self._keycloak_admin.get_user(user_id)
except Exception:
return fallback
def _strip_totp_action(self, user_id: str, full_user: dict[str, Any]) -> None:
actions = full_user.get("requiredActions")
if not isinstance(actions, list) or "CONFIGURE_TOTP" not in actions:
return
new_actions = [action for action in actions if action != "CONFIGURE_TOTP"]
self._keycloak_admin.update_user_safe(user_id, {"requiredActions": new_actions})
def _ensure_contact_email(self, ctx: RequestContext, full_user: dict[str, Any]) -> None:
email_value = full_user.get("email")
if isinstance(email_value, str) and email_value.strip():
return
if isinstance(ctx.email_verified_at, datetime) and ctx.contact_email.strip():
self._keycloak_admin.update_user_safe(
ctx.user_id,
{"email": ctx.contact_email.strip(), "emailVerified": True},
)
def _ensure_mailu_attrs(self, ctx: RequestContext, full_user: dict[str, Any]) -> None:
    """Ensure the user has a mailu address attribute and is flagged enabled.

    Also records the effective mailu address on *ctx* for later tasks.
    """
    attrs = full_user.get("attributes") or {}
    if not isinstance(attrs, dict):
        return
    stored_email = _extract_attr(attrs, MAILU_EMAIL_ATTR)
    if stored_email:
        ctx.mailu_email = stored_email
    else:
        # No stored address: derive one from the username and persist it.
        ctx.mailu_email = f"{ctx.username}@{self._settings.mailu_domain}"
        self._keycloak_admin.set_user_attribute(ctx.username, MAILU_EMAIL_ATTR, ctx.mailu_email)
    truthy = {"1", "true", "yes", "y", "on"}
    if _extract_attr(attrs, MAILU_ENABLED_ATTR).lower() not in truthy:
        self._keycloak_admin.set_user_attribute(ctx.username, MAILU_ENABLED_ATTR, "true")
def _sync_user_profile(self, ctx: RequestContext, user: dict[str, Any]) -> None:
try:
full_user = self._fetch_full_user(ctx.user_id, user)
self._strip_totp_action(ctx.user_id, full_user)
self._ensure_contact_email(ctx, full_user)
self._ensure_mailu_attrs(ctx, full_user)
except Exception:
ctx.mailu_email = f"{ctx.username}@{self._settings.mailu_domain}"
def _ensure_keycloak_user(self, conn, ctx: RequestContext) -> bool:
start = datetime.now(timezone.utc)
try:
user = self._create_or_fetch_user(ctx)
ctx.user_id = str((user or {}).get("id") or "")
if not ctx.user_id:
raise RuntimeError("user id missing")
self._sync_user_profile(ctx, user)
self._task_ok(conn, ctx.request_code, "keycloak_user", None, start)
return True
except Exception as exc:
detail = safe_error_detail(exc, "failed to ensure user")
self._task_fail(conn, ctx.request_code, "keycloak_user", detail, start)
return False
def _ensure_keycloak_password(self, conn, ctx: RequestContext) -> None:
    """Set the user's initial Keycloak password exactly once per request.

    A password is only (re)generated while the request is still in
    'accounts_building' and before the initial password was revealed to
    the user; after reveal the stored credential is left untouched.
    """
    start = datetime.now(timezone.utc)
    try:
        should_reset = ctx.status == "accounts_building" and ctx.revealed_at is None
        password_value: str | None = None
        if should_reset:
            if isinstance(ctx.initial_password, str) and ctx.initial_password:
                # Re-apply the already-stored password (idempotent retry).
                password_value = ctx.initial_password
            elif ctx.initial_password is None:
                password_value = self._random_password(20)
                # Guarded UPDATE: only fills the column while still NULL, so a
                # concurrent worker cannot overwrite an existing password.
                conn.execute(
                    """
                    UPDATE access_requests
                    SET initial_password = %s
                    WHERE request_code = %s AND initial_password IS NULL
                    """,
                    (password_value, ctx.request_code),
                )
                ctx.initial_password = password_value
            if password_value:
                self._keycloak_admin.reset_password(ctx.user_id, password_value, temporary=False)
        if isinstance(ctx.initial_password, str) and ctx.initial_password:
            self._task_ok(conn, ctx.request_code, "keycloak_password", None, start)
        elif ctx.revealed_at is not None:
            # Password was already shown to the user; nothing left to do.
            detail = "initial password already revealed"
            self._task_ok(conn, ctx.request_code, "keycloak_password", detail, start)
        else:
            raise RuntimeError("initial password missing")
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to set password")
        self._task_fail(conn, ctx.request_code, "keycloak_password", detail, start)
def _ensure_keycloak_groups(self, conn, ctx: RequestContext) -> None:
start = datetime.now(timezone.utc)
try:
approved_flags = [flag for flag in ctx.approval_flags if flag in self._settings.allowed_flag_groups]
groups = list(dict.fromkeys(self._settings.default_user_groups + approved_flags))
for group_name in groups:
gid = self._keycloak_admin.get_group_id(group_name)
if not gid:
raise RuntimeError(f"group missing: {group_name}")
self._keycloak_admin.add_user_to_group(ctx.user_id, gid)
self._task_ok(conn, ctx.request_code, "keycloak_groups", None, start)
except Exception as exc:
detail = safe_error_detail(exc, "failed to add groups")
self._task_fail(conn, ctx.request_code, "keycloak_groups", detail, start)
def _ensure_mailu_app_password(self, conn, ctx: RequestContext) -> None:
    """Mint the mailu app-password attribute when the user lacks one."""
    started = datetime.now(timezone.utc)
    try:
        attributes = self._keycloak_admin.get_user(ctx.user_id).get("attributes") or {}
        if not _extract_attr(attributes, MAILU_APP_PASSWORD_ATTR):
            # Generated once; an existing app password is never replaced.
            self._keycloak_admin.set_user_attribute(
                ctx.username, MAILU_APP_PASSWORD_ATTR, self._random_password()
            )
        self._task_ok(conn, ctx.request_code, "mailu_app_password", None, started)
    except Exception as exc:
        message = safe_error_detail(exc, "failed to set mail password")
        self._task_fail(conn, ctx.request_code, "mailu_app_password", message, started)
def _sync_mailu(self, conn, ctx: RequestContext) -> bool:
start = datetime.now(timezone.utc)
try:
if not self._mailu.ready():
detail = "mailu not configured"
self._task_ok(conn, ctx.request_code, "mailu_sync", detail, start)
return True
self._mailu.sync(reason="ariadne_access_approve", force=True)
mailbox_ready = self._mailu.wait_for_mailbox(
ctx.mailu_email,
self._settings.mailu_mailbox_wait_timeout_sec,
)
if not mailbox_ready:
raise RuntimeError("mailbox not ready")
self._task_ok(conn, ctx.request_code, "mailu_sync", None, start)
return True
except Exception as exc:
detail = safe_error_detail(exc, "failed to sync mailu")
self._task_fail(conn, ctx.request_code, "mailu_sync", detail, start)
return False
def _sync_nextcloud_mail(self, conn, ctx: RequestContext) -> None:
start = datetime.now(timezone.utc)
try:
if not self._settings.nextcloud_namespace:
detail = "sync disabled"
self._task_ok(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
return
result = self._nextcloud.sync_mail(ctx.username, wait=True)
if isinstance(result, dict) and result.get("status") == "ok":
self._task_ok(conn, ctx.request_code, "nextcloud_mail_sync", None, start)
return
status_val = result.get("status") if isinstance(result, dict) else "error"
summary = result.get("summary") if isinstance(result, dict) else None
detail = ""
if summary is not None:
detail = getattr(summary, "detail", "") or ""
if not detail and isinstance(result, dict):
detail = str(result.get("detail") or "")
detail = detail or str(status_val)
self._task_fail(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
except Exception as exc:
detail = safe_error_detail(exc, "failed to sync nextcloud")
self._task_fail(conn, ctx.request_code, "nextcloud_mail_sync", detail, start)
def _ensure_wger_account(self, conn, ctx: RequestContext) -> None:
    """Provision a wger account: mint a stored password, then sync once.

    The password is kept in a Keycloak user attribute; the updated-at
    attribute acts as a "synced" marker, so the remote sync only runs
    until it first succeeds.
    """
    start = datetime.now(timezone.utc)
    try:
        full = self._keycloak_admin.get_user(ctx.user_id)
        attrs = full.get("attributes") or {}
        wger_password = _extract_attr(attrs, WGER_PASSWORD_ATTR)
        wger_password_updated_at = _extract_attr(attrs, WGER_PASSWORD_UPDATED_ATTR)
        if not wger_password:
            # First pass: generate and persist the service password.
            wger_password = self._random_password(20)
            self._keycloak_admin.set_user_attribute(ctx.username, WGER_PASSWORD_ATTR, wger_password)
        if not wger_password_updated_at:
            result = self._wger.sync_user(ctx.username, ctx.mailu_email, wger_password, wait=True)
            status_val = result.get("status") if isinstance(result, dict) else "error"
            if status_val != "ok":
                detail = result.get("detail") if isinstance(result, dict) else ""
                detail = detail or f"wger sync {status_val}"
                raise RuntimeError(detail)
            now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
            self._keycloak_admin.set_user_attribute(ctx.username, WGER_PASSWORD_UPDATED_ATTR, now_iso)
        self._task_ok(conn, ctx.request_code, "wger_account", None, start)
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to provision wger")
        self._task_fail(conn, ctx.request_code, "wger_account", detail, start)
def _ensure_firefly_account(self, conn, ctx: RequestContext) -> None:
    """Provision a Firefly III account: mint a stored password, then sync once.

    Mirrors the wger flow: the updated-at attribute is the "synced"
    marker, so the remote sync only runs until it first succeeds.
    """
    start = datetime.now(timezone.utc)
    try:
        full = self._keycloak_admin.get_user(ctx.user_id)
        attrs = full.get("attributes") or {}
        firefly_password = _extract_attr(attrs, FIREFLY_PASSWORD_ATTR)
        firefly_password_updated_at = _extract_attr(attrs, FIREFLY_PASSWORD_UPDATED_ATTR)
        if not firefly_password:
            # First pass: generate and persist the service password.
            firefly_password = self._random_password(24)
            self._keycloak_admin.set_user_attribute(ctx.username, FIREFLY_PASSWORD_ATTR, firefly_password)
        if not firefly_password_updated_at:
            result = self._firefly.sync_user(ctx.mailu_email, firefly_password, wait=True)
            status_val = result.get("status") if isinstance(result, dict) else "error"
            if status_val != "ok":
                raise RuntimeError(f"firefly sync {status_val}")
            now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
            self._keycloak_admin.set_user_attribute(ctx.username, FIREFLY_PASSWORD_UPDATED_ATTR, now_iso)
        self._task_ok(conn, ctx.request_code, "firefly_account", None, start)
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to provision firefly")
        self._task_fail(conn, ctx.request_code, "firefly_account", detail, start)
def _handle_vaultwarden_grandfathered(self, conn, ctx: RequestContext, start: datetime) -> None:
    """Resolve the invite task for users flagged as already having vaultwarden.

    Looks the user up by recovery email: present -> ok, missing ->
    permanent error, rate-limited -> pending with backoff; any other
    lookup result goes through the generic failure classifier.
    """
    lookup = self._vaultwarden.find_user_by_email(ctx.contact_email)
    if lookup.status == "rate_limited":
        detail, _ = self._vaultwarden_rate_limit_detail()
        self._task_pending(conn, ctx.request_code, "vaultwarden_invite", detail, start)
        self._set_vaultwarden_attrs(ctx.username, ctx.contact_email, "rate_limited")
        return
    if lookup.ok and lookup.status == "present":
        self._task_ok(conn, ctx.request_code, "vaultwarden_invite", "grandfathered", start)
        self._set_vaultwarden_attrs(ctx.username, ctx.contact_email, "grandfathered")
        return
    if lookup.ok and lookup.status == "missing":
        self._task_error(
            conn,
            ctx.request_code,
            "vaultwarden_invite",
            "vaultwarden account not found for recovery email",
            start,
        )
        return
    detail = lookup.detail or lookup.status
    self._task_fail(conn, ctx.request_code, "vaultwarden_invite", detail, start)
def _ensure_vaultwarden_invite(self, conn, ctx: RequestContext) -> None:
    """Invite the user to vaultwarden once their mailbox exists.

    Honors the rate-limit backoff recorded on the task row, delegates
    grandfathered users to the lookup path, and retries a mailu sync
    once when the mailbox is not yet present.
    """
    start = datetime.now(timezone.utc)
    try:
        if not self._vaultwarden_retry_due(conn, ctx.request_code):
            # Still inside a recorded rate-limit window; try again later.
            return
        if VAULTWARDEN_GRANDFATHERED_FLAG in ctx.approval_flags:
            self._handle_vaultwarden_grandfathered(conn, ctx, start)
            return
        if not self._mailu.wait_for_mailbox(ctx.mailu_email, self._settings.mailu_mailbox_wait_timeout_sec):
            # Nudge mailu once (best-effort), then give the mailbox a second chance.
            try:
                self._mailu.sync(reason="ariadne_vaultwarden_retry", force=True)
            except Exception:
                pass
            if not self._mailu.wait_for_mailbox(ctx.mailu_email, self._settings.mailu_mailbox_wait_timeout_sec):
                raise RuntimeError("mailbox not ready")
        result = self._vaultwarden.invite_user(ctx.mailu_email)
        if result.ok:
            self._task_ok(conn, ctx.request_code, "vaultwarden_invite", result.status, start)
        elif result.status == "rate_limited":
            detail, _ = self._vaultwarden_rate_limit_detail()
            self._task_pending(conn, ctx.request_code, "vaultwarden_invite", detail, start)
        else:
            detail = result.detail or result.status
            self._task_error(conn, ctx.request_code, "vaultwarden_invite", detail, start)
        # NOTE(review): both branches of this conditional yield result.status,
        # so it is a no-op; kept as-is to preserve behavior byte-for-byte.
        status = result.status if result.status != "rate_limited" else "rate_limited"
        self._set_vaultwarden_attrs(ctx.username, ctx.mailu_email, status)
    except Exception as exc:
        detail = safe_error_detail(exc, "failed to provision vaultwarden")
        self._task_fail(conn, ctx.request_code, "vaultwarden_invite", detail, start)
def _send_welcome_email(self, request_code: str, username: str, contact_email: str) -> None:
    """Send the onboarding email at most once per request.

    Skips when the feature is disabled or no contact address exists; a
    MailerError aborts silently so provisioning is never blocked by mail.
    """
    if not self._settings.welcome_email_enabled:
        return
    if not contact_email:
        return
    try:
        row = self._db.fetchone(
            "SELECT welcome_email_sent_at FROM access_requests WHERE request_code = %s",
            (request_code,),
        )
        if row and row.get("welcome_email_sent_at"):
            # Already delivered for this request; never resend.
            return
        onboarding_url = f"{self._settings.portal_public_base_url}/onboarding?code={request_code}"
        self._mailer.send_welcome(contact_email, request_code, onboarding_url, username=username)
        self._storage.mark_welcome_sent(request_code)
    except MailerError:
        return

View File

@ -0,0 +1,73 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
import hashlib
from typing import Any
# Keycloak user-attribute keys used to persist per-service provisioning state.
MAILU_EMAIL_ATTR = "mailu_email"
MAILU_APP_PASSWORD_ATTR = "mailu_app_password"
MAILU_ENABLED_ATTR = "mailu_enabled"
WGER_PASSWORD_ATTR = "wger_password"
WGER_PASSWORD_UPDATED_ATTR = "wger_password_updated_at"
FIREFLY_PASSWORD_ATTR = "firefly_password"
FIREFLY_PASSWORD_UPDATED_ATTR = "firefly_password_updated_at"
# Approval flag marking users whose vaultwarden account predates provisioning.
VAULTWARDEN_GRANDFATHERED_FLAG = "vaultwarden_grandfathered"
# HTTP status codes treated as transient when classifying task failures.
_RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
# Substrings of (lower-cased) error details that indicate a transient failure.
_RETRYABLE_TOKENS = (
    "timeout",
    "temporar",
    "rate limited",
    "mailbox not ready",
    "connection refused",
    "connection reset",
    "network is unreachable",
    "dns",
    "name resolution",
    "service unavailable",
    "bad gateway",
    "gateway timeout",
)
@dataclass(frozen=True)
class ProvisionOutcome:
    """Immutable (ok, status) summary of a provisioning attempt."""

    # Overall success flag for the attempt.
    ok: bool
    # Short status label describing the outcome.
    status: str
@dataclass
class RequestContext:
    """Mutable working state for one access request while it is provisioned."""

    request_code: str
    username: str
    first_name: str
    last_name: str
    contact_email: str
    # When the contact email was verified; None means not yet verified.
    email_verified_at: datetime | None
    status: str
    initial_password: str | None
    # When the initial password was shown to the user (None = not yet revealed).
    revealed_at: datetime | None
    attempted_at: datetime | None
    approval_flags: list[str]
    # Filled in during provisioning once the Keycloak user exists.
    user_id: str = ""
    mailu_email: str = ""
def _advisory_lock_id(request_code: str) -> int:
digest = hashlib.sha256(request_code.encode("utf-8")).digest()
return int.from_bytes(digest[:8], "big", signed=True)
def _extract_attr(attrs: Any, key: str) -> str:
if not isinstance(attrs, dict):
return ""
raw = attrs.get(key)
if isinstance(raw, list):
for item in raw:
if isinstance(item, str) and item.strip():
return item.strip()
return ""
if isinstance(raw, str) and raw.strip():
return raw.strip()
return ""

View File

@ -0,0 +1,176 @@
from __future__ import annotations
from datetime import datetime, timedelta, timezone
import re
from ..db.storage import TaskRunRecord
from .provisioning_protocol import _RETRYABLE_HTTP_CODES, _RETRYABLE_TOKENS
class _ProvisioningTaskMixin:
def _ensure_task_rows(self, conn, request_code: str, tasks: list[str]) -> None:
if not tasks:
return
conn.execute(
"""
INSERT INTO access_request_tasks (request_code, task, status, detail, updated_at)
SELECT %s, task, 'pending', NULL, NOW()
FROM UNNEST(%s::text[]) AS task
ON CONFLICT (request_code, task) DO NOTHING
""",
(request_code, tasks),
)
def _upsert_task(self, conn, request_code: str, task: str, status: str, detail: str | None = None) -> None:
conn.execute(
"""
INSERT INTO access_request_tasks (request_code, task, status, detail, updated_at)
VALUES (%s, %s, %s, %s, NOW())
ON CONFLICT (request_code, task)
DO UPDATE SET status = EXCLUDED.status, detail = EXCLUDED.detail, updated_at = NOW()
""",
(request_code, task, status, detail),
)
def _task_statuses(self, conn, request_code: str) -> dict[str, str]:
rows = conn.execute(
"SELECT task, status FROM access_request_tasks WHERE request_code = %s",
(request_code,),
).fetchall()
output: dict[str, str] = {}
for row in rows:
task = row.get("task") if isinstance(row, dict) else None
status = row.get("status") if isinstance(row, dict) else None
if isinstance(task, str) and isinstance(status, str):
output[task] = status
return output
def _all_tasks_ok(self, conn, request_code: str, tasks: list[str]) -> bool:
statuses = self._task_statuses(conn, request_code)
for task in tasks:
if statuses.get(task) != "ok":
return False
return True
def _record_task(self, request_code: str, task: str, status: str, detail: str | None, started: datetime) -> None:
    """Emit one task run to metrics, the structured log, and storage.

    The storage writes (event + task-run row) are best-effort: a failure
    there must never break the provisioning flow, so both are swallowed.
    """
    finished = datetime.now(timezone.utc)
    duration_sec = (finished - started).total_seconds()
    self._record_task_run_metric(task, status, duration_sec)
    self._logger.info(
        "task run",
        extra={
            "event": "task_run",
            "request_code": request_code,
            "task": task,
            "status": status,
            "duration_sec": round(duration_sec, 3),
            "detail": detail or "",
        },
    )
    try:
        # Audit-style event stream (best-effort).
        self._storage.record_event(
            "provision_task",
            {
                "request_code": request_code,
                "task": task,
                "status": status,
                "duration_sec": round(duration_sec, 3),
                "detail": detail or "",
            },
        )
    except Exception:
        pass
    try:
        # Durable per-run record (best-effort).
        self._storage.record_task_run(
            TaskRunRecord(
                request_code=request_code,
                task=task,
                status=status,
                detail=detail,
                started_at=started,
                finished_at=finished,
                duration_ms=int(duration_sec * 1000),
            )
        )
    except Exception:
        pass
def _task_ok(self, conn, request_code: str, task: str, detail: str | None, started: datetime) -> None:
self._upsert_task(conn, request_code, task, "ok", detail)
self._record_task(request_code, task, "ok", detail, started)
def _task_error(self, conn, request_code: str, task: str, detail: str, started: datetime) -> None:
self._upsert_task(conn, request_code, task, "error", detail)
self._record_task(request_code, task, "error", detail, started)
def _task_pending(self, conn, request_code: str, task: str, detail: str, started: datetime) -> None:
self._upsert_task(conn, request_code, task, "pending", detail)
self._record_task(request_code, task, "pending", detail, started)
def _is_retryable_detail(self, detail: str) -> bool:
    """Heuristically classify an error detail as transient (safe to retry).

    Matches either a leading "http NNN" with a retryable status code, or
    any of the known transient-failure tokens anywhere in the detail.
    """
    if not detail:
        return False
    lowered = detail.lower()
    http_match = re.match(r"^http\s+(\d{3})", lowered)
    if http_match and int(http_match.group(1)) in _RETRYABLE_HTTP_CODES:
        return True
    return any(token in lowered for token in _RETRYABLE_TOKENS)
def _retryable_detail(self, detail: str) -> str:
cleaned = detail.strip() if isinstance(detail, str) else ""
if not cleaned:
return "retryable: temporary failure"
return f"retryable: {cleaned}"
def _task_fail(self, conn, request_code: str, task: str, detail: str, started: datetime) -> None:
detail_lower = detail.lower()
if "missing verified email address" in detail_lower or "email not verified" in detail_lower:
self._task_pending(conn, request_code, task, "blocked: email not verified", started)
return
if self._is_retryable_detail(detail):
self._task_pending(conn, request_code, task, self._retryable_detail(detail), started)
return
self._task_error(conn, request_code, task, detail, started)
def _vaultwarden_rate_limit_detail(self) -> tuple[str, datetime]:
retry_at = datetime.now(timezone.utc) + timedelta(
seconds=float(self._settings.vaultwarden_admin_rate_limit_backoff_sec)
)
retry_iso = retry_at.strftime("%Y-%m-%dT%H:%M:%SZ")
return f"rate limited until {retry_iso}", retry_at
@staticmethod
def _parse_retry_at(detail: str) -> datetime | None:
prefix = "rate limited until "
if not isinstance(detail, str) or not detail.startswith(prefix):
return None
ts = detail[len(prefix) :].strip()
for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z"):
try:
parsed = datetime.strptime(ts, fmt)
if parsed.tzinfo is None:
parsed = parsed.replace(tzinfo=timezone.utc)
return parsed
except ValueError:
continue
return None
def _vaultwarden_retry_due(self, conn, request_code: str) -> bool:
row = conn.execute(
"""
SELECT status, detail
FROM access_request_tasks
WHERE request_code = %s AND task = 'vaultwarden_invite'
""",
(request_code,),
).fetchone()
if not isinstance(row, dict):
return True
if row.get("status") != "pending":
return True
retry_at = self._parse_retry_at(row.get("detail") or "")
if not retry_at:
return True
return datetime.now(timezone.utc) >= retry_at

View File

@ -72,18 +72,16 @@ CLUSTER_STATE_KUSTOMIZATIONS_NOT_READY = Gauge(
def record_task_run(task: str, status: str, duration_sec: float | None) -> None:
    """Increment task counters and duration histograms for one run."""
    TASK_RUNS_TOTAL.labels(task=task, status=status).inc()
    # Duration may be unknown for some runs; skip the histogram sample then.
    if duration_sec is not None:
        TASK_DURATION_SECONDS.labels(task=task, status=status).observe(duration_sec)
def record_schedule_state(
task: str,
last_run_ts: float | None,
last_success_ts: float | None,
next_run_ts: float | None,
ok: bool | None,
) -> None:
def record_schedule_state(task: str, last_run_ts: float | None, last_success_ts: float | None, next_run_ts: float | None, ok: bool | None) -> None:
"""Publish the latest scheduler timestamps and status for a task."""
if last_run_ts:
SCHEDULE_LAST_RUN_TS.labels(task=task).set(last_run_ts)
if last_success_ts:
@ -97,17 +95,15 @@ def record_schedule_state(
def set_access_request_counts(counts: dict[str, int]) -> None:
    """Set access-request gauges grouped by lifecycle status."""
    # One gauge sample per status key present in the snapshot.
    for status, count in counts.items():
        ACCESS_REQUESTS.labels(status=status).set(count)
def set_cluster_state_metrics(
collected_at: datetime,
nodes_total: int | None,
nodes_ready: int | None,
pods_running: float | None,
kustomizations_not_ready: int | None,
) -> None:
def set_cluster_state_metrics(collected_at: datetime, nodes_total: int | None, nodes_ready: int | None, pods_running: float | None, kustomizations_not_ready: int | None) -> None:
"""Set cluster-state gauges from the most recent collector snapshot."""
CLUSTER_STATE_LAST_TS.set(collected_at.timestamp())
if nodes_total is not None:
CLUSTER_STATE_NODES_TOTAL.set(nodes_total)

View File

@ -24,6 +24,8 @@ def _build_db(dsn: str, application_name: str) -> Database:
def main() -> None:
"""Run configured Ariadne and portal database migrations."""
if not settings.ariadne_run_migrations:
return

View File

@ -22,6 +22,8 @@ class CronTask:
class CronScheduler:
"""Run named cron tasks while recording schedule state and outcomes."""
def __init__(self, storage: Storage, tick_sec: float = 5.0) -> None:
self._storage = storage
self._tick_sec = tick_sec
@ -41,6 +43,7 @@ class CronScheduler:
def start(self) -> None:
    """Start the scheduler thread; no-op when it is already running."""
    if self._thread and self._thread.is_alive():
        return
    # Restore last-known schedule gauges before the loop starts emitting new ones.
    self._hydrate_schedule_metrics()
    self._stop_event.clear()
    self._thread = threading.Thread(target=self._run_loop, name="ariadne-scheduler", daemon=True)
    self._thread.start()
@ -83,6 +86,40 @@ class CronScheduler:
)
time.sleep(self._tick_sec)
def _hydrate_schedule_metrics(self) -> None:
    """Re-publish persisted schedule state to metrics after a restart.

    Best-effort: storage backends without list_schedule_states are
    skipped, and any other read failure is logged and ignored.
    """
    try:
        states = self._storage.list_schedule_states()
    except AttributeError:
        # Storage implementation predates schedule-state persistence.
        return
    except Exception as exc:
        self._logger.warning(
            "schedule metric hydration failed",
            extra={"event": "schedule_hydration_error", "detail": str(exc)},
        )
        return
    known_tasks = set(self._tasks)
    for state in states:
        if state.task_name not in known_tasks:
            # Stale row for a task that is no longer registered.
            continue
        last_finished = state.last_finished_at or state.last_started_at
        last_success = last_finished if state.last_status == "ok" else None
        if state.last_status == "ok":
            ok: bool | None = True
        elif state.last_status == "error":
            ok = False
        else:
            # Unknown/partial status: publish no success flag.
            ok = None
        record_schedule_state(
            state.task_name,
            state.last_started_at.timestamp() if state.last_started_at else None,
            last_success.timestamp() if last_success else None,
            self._next_run.get(state.task_name).timestamp()
            if self._next_run.get(state.task_name)
            else None,
            ok,
        )
def _execute_task(self, task: CronTask) -> None:
started = datetime.now(timezone.utc)
status = "ok"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,180 @@
from __future__ import annotations
from typing import Any
from .cluster_state_contract import *
from .cluster_state_relationships import *
def _severity_rank(value: Any) -> int:
if value == "critical":
return 0
if value == "warning":
return 1
return 2
def _pvc_pressure_signals(metrics: dict[str, Any]) -> list[dict[str, Any]]:
    """Build per-PVC pressure signals for volumes above the usage threshold."""
    signals: list[dict[str, Any]] = []
    for item in _pvc_top(metrics.get("pvc_usage_top", [])):
        used = item.get("used_percent")
        if not isinstance(used, (int, float)) or used < _PVC_PRESSURE_THRESHOLD:
            continue
        # Critical only once usage passes the higher threshold.
        severity = "critical" if used >= _PVC_CRITICAL_THRESHOLD else "warning"
        signals.append(
            {
                "scope": "pvc",
                "target": f"{item.get('namespace')}/{item.get('pvc')}",
                "metric": "used_percent",
                "current": used,
                "severity": severity,
            }
        )
    return signals
def _build_anomalies(
    metrics: dict[str, Any],
    nodes_summary: dict[str, Any],
    workloads_health: dict[str, Any],
    kustomizations: dict[str, Any],
    events: dict[str, Any],
) -> list[dict[str, Any]]:
    """Collect anomaly records from every cluster signal source, in a fixed order."""
    found: list[dict[str, Any]] = []
    _append_pod_anomalies(found, metrics)
    _append_workload_anomalies(found, workloads_health)
    _append_flux_anomalies(found, kustomizations)
    _append_job_failure_anomalies(found, metrics)
    _append_pvc_anomalies(found, metrics)
    _append_node_anomalies(found, nodes_summary)
    _append_event_anomalies(found, events)
    return found
def _append_pod_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None:
pods_pending = metrics.get("pods_pending") or 0
pods_failed = metrics.get("pods_failed") or 0
if pods_pending:
anomalies.append(
{
"kind": "pods_pending",
"severity": "warning",
"summary": f"{int(pods_pending)} pods pending",
}
)
if pods_failed:
anomalies.append(
{
"kind": "pods_failed",
"severity": "critical",
"summary": f"{int(pods_failed)} pods failed",
}
)
def _append_workload_anomalies(anomalies: list[dict[str, Any]], workloads_health: dict[str, Any]) -> None:
for key in ("deployments", "statefulsets", "daemonsets"):
entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {}
not_ready = entry.get("not_ready") or 0
if not_ready:
anomalies.append(
{
"kind": f"{key}_not_ready",
"severity": "warning",
"summary": f"{int(not_ready)} {key} not ready",
"items": entry.get("items"),
}
)
def _append_flux_anomalies(anomalies: list[dict[str, Any]], kustomizations: dict[str, Any]) -> None:
flux_not_ready = (kustomizations or {}).get("not_ready") or 0
if flux_not_ready:
anomalies.append(
{
"kind": "flux_not_ready",
"severity": "warning",
"summary": f"{int(flux_not_ready)} Flux kustomizations not ready",
"items": (kustomizations or {}).get("items"),
}
)
def _append_job_failure_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None:
job_failures = metrics.get("job_failures_24h") or []
job_failures = [
entry for entry in job_failures if isinstance(entry, dict) and (entry.get("value") or 0) > 0
]
if job_failures:
anomalies.append(
{
"kind": "job_failures_24h",
"severity": "warning",
"summary": "Job failures in last 24h",
"items": job_failures[:5],
}
)
def _append_pvc_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None:
    """Append a PVC pressure anomaly when any PVCs exceed the usage threshold."""
    pressured = _pvc_pressure_entries(metrics)
    if not pressured:
        return
    anomalies.append(
        {
            "kind": "pvc_pressure",
            "severity": "warning",
            "summary": f"PVCs above {_PVC_PRESSURE_THRESHOLD:.0f}% usage",
            "items": pressured[:5],
        }
    )
def _pvc_pressure_entries(metrics: dict[str, Any]) -> list[dict[str, Any]]:
    """Return PVC usage rows at or above the pressure threshold."""
    pressured: list[dict[str, Any]] = []
    for row in _pvc_top(metrics.get("pvc_usage_top") or []):
        if not isinstance(row, dict):
            continue
        used = row.get("used_percent")
        if isinstance(used, (int, float)) and float(used or 0) >= _PVC_PRESSURE_THRESHOLD:
            pressured.append(row)
    return pressured
def _append_node_anomalies(anomalies: list[dict[str, Any]], nodes_summary: dict[str, Any]) -> None:
if not nodes_summary:
return
pressure_nodes = nodes_summary.get("pressure_nodes") or {}
flagged = [
name for names in pressure_nodes.values() if isinstance(names, list) for name in names if name
]
if flagged:
anomalies.append(
{
"kind": "node_pressure",
"severity": "warning",
"summary": f"{len(flagged)} nodes report pressure",
"items": sorted(set(flagged)),
}
)
unschedulable = nodes_summary.get("unschedulable_nodes") or []
if unschedulable:
anomalies.append(
{
"kind": "unschedulable_nodes",
"severity": "info",
"summary": f"{len(unschedulable)} nodes unschedulable",
"items": unschedulable,
}
)
def _append_event_anomalies(anomalies: list[dict[str, Any]], events: dict[str, Any]) -> None:
if not events:
return
warnings = events.get("warnings_total") or 0
if warnings:
anomalies.append(
{
"kind": "event_warnings",
"severity": "info",
"summary": f"{int(warnings)} warning events",
"items": events.get("warnings") or [],
}
)
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,117 @@
from __future__ import annotations
from typing import Any
from .cluster_state_anomalies import *
from .cluster_state_contract import *
from .cluster_state_health import *
def _node_attention_score(node: dict[str, Any]) -> tuple[float, list[str]]:
    """Score how much operator attention a node needs; returns (score, reasons).

    Absolute utilisation over an alert threshold, network/IO spikes relative
    to a 24h baseline, and pressure flags each add to the score.
    """
    score = 0.0
    reasons: list[str] = []
    # Absolute thresholds: disk weighs heaviest, then cpu/ram equally.
    for key, alert, base_points, divisor in (
        ("disk", _NODE_DISK_ALERT, 3, 10),
        ("cpu", _NODE_CPU_ALERT, 2, 20),
        ("ram", _NODE_RAM_ALERT, 2, 20),
    ):
        value = node.get(key)
        if isinstance(value, (int, float)) and value >= alert:
            score += base_points + (value - alert) / divisor
            reasons.append(f"{key} {value:.1f}%")
    raw_baseline = node.get("baseline")
    baseline = raw_baseline if isinstance(raw_baseline, dict) else {}
    for key, multiplier in (("net", _NET_SPIKE_MULTIPLIER), ("io", _IO_SPIKE_MULTIPLIER)):
        current = node.get(key)
        base_entry = baseline.get(key)
        base_max = base_entry.get("max") if isinstance(base_entry, dict) else None
        spiking = (
            isinstance(current, (int, float))
            and isinstance(base_max, (int, float))
            and base_max > 0
            and current > base_max * multiplier
        )
        if spiking:
            score += 1.5
            reasons.append(f"{key} {current:.2f} > {multiplier:.1f}x baseline")
    flags = node.get("pressure_flags")
    if isinstance(flags, list) and flags:
        score += 2
        reasons.append("pressure flags")
    return score, reasons
def _node_attention_entries(node_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build attention rows for every named node with a positive score."""
    rows: list[dict[str, Any]] = []
    for node in node_context:
        if not isinstance(node, dict):
            continue
        name = node.get("node")
        if not (isinstance(name, str) and name):
            continue
        score, reasons = _node_attention_score(node)
        if score > 0:
            rows.append(
                {
                    "kind": "node",
                    "target": name,
                    "score": round(score, 2),
                    "reasons": reasons,
                }
            )
    return rows
def _pvc_attention_entries(metrics: dict[str, Any]) -> list[dict[str, Any]]:
    """Build attention rows for PVCs under pressure, scored by overage."""
    rows: list[dict[str, Any]] = []
    for row in _pvc_pressure_entries(metrics):
        if not isinstance(row, dict):
            continue
        used = float(row.get("used_percent") or 0)
        rows.append(
            {
                "kind": "pvc",
                "target": f"{row.get('namespace')}/{row.get('pvc')}",
                # 1 point at the threshold, +0.1 per extra percent used.
                "score": round(1 + (used - _PVC_PRESSURE_THRESHOLD) / 10, 2),
                "reasons": [f"usage {used:.1f}%"],
            }
        )
    return rows
def _pod_attention_entries(pod_issues: dict[str, Any]) -> list[dict[str, Any]]:
entries: list[dict[str, Any]] = []
pending = pod_issues.get("pending_over_15m") or 0
if pending:
entries.append(
{
"kind": "pods",
"target": "pending",
"score": float(pending),
"reasons": [f"{int(pending)} pending >15m"],
}
)
return entries
def _workload_attention_entries(workloads_health: dict[str, Any]) -> list[dict[str, Any]]:
    """Build fixed-score attention rows for the first five not-ready workloads."""
    return [
        {
            "kind": "workload",
            "target": f"{item.get('namespace')}/{item.get('name')}",
            "score": 2.0,
            "reasons": [f"{item.get('ready')}/{item.get('desired')} ready"],
        }
        for item in _workload_not_ready_items(workloads_health)[:5]
    ]
def _build_attention_ranked(metrics: dict[str, Any], node_context: list[dict[str, Any]], pod_issues: dict[str, Any], workloads_health: dict[str, Any]) -> list[dict[str, Any]]:
    """Merge all attention sources and return the top five by score."""
    rows: list[dict[str, Any]] = []
    rows.extend(_node_attention_entries(node_context))
    rows.extend(_pvc_attention_entries(metrics))
    rows.extend(_pod_attention_entries(pod_issues))
    rows.extend(_workload_attention_entries(workloads_health))

    def rank(item: dict[str, Any]) -> tuple[float, str, str]:
        # Highest score first; kind then target break ties deterministically.
        return (-(item.get("score") or 0), item.get("kind") or "", item.get("target") or "")

    rows.sort(key=rank)
    return rows[:5]
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,121 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
# Tuning constants shared across the cluster-state modules.
# Prometheus [value, timestamp] pairs have exactly two elements.
_VALUE_PAIR_LEN = 2
# Query windows for rate/restart/baseline expressions.
_RATE_WINDOW = "5m"
_RESTARTS_WINDOW = "1h"
_BASELINE_WINDOW = "24h"
_TREND_WINDOWS = ("1h", "6h", "24h")
# Per-dimension row limits for trend queries.
_TREND_NODE_LIMIT = 30
_TREND_NAMESPACE_LIMIT = 20
_TREND_PVC_LIMIT = 10
_TREND_JOB_LIMIT = 10
_TREND_POD_LIMIT = 15
# Node utilisation alert thresholds (percent).
_NODE_DISK_ALERT = 80.0
_NODE_CPU_ALERT = 80.0
_NODE_RAM_ALERT = 80.0
# A net/io sample above N x its 24h baseline max counts as a spike.
_NET_SPIKE_MULTIPLIER = 2.0
_IO_SPIKE_MULTIPLIER = 2.0
_NODE_UNAME_LABEL = 'node_uname_info{nodename!=""}'
# Labels probed (in order) to name a pod's owning workload.
_WORKLOAD_LABEL_KEYS = (
    "app.kubernetes.io/name",
    "app",
    "k8s-app",
    "app.kubernetes.io/instance",
    "release",
)
# Namespaces treated as platform plumbing and filtered from workload views.
_SYSTEM_NAMESPACES = {
    "kube-system",
    "kube-public",
    "kube-node-lease",
    "flux-system",
    "monitoring",
    "logging",
    "traefik",
    "cert-manager",
    "maintenance",
    "postgres",
    "vault",
}
# Exceptions re-admitted despite appearing in _SYSTEM_NAMESPACES.
_WORKLOAD_ALLOWED_NAMESPACES = {
    "maintenance",
}
# Percent deviation from baseline that warrants warning/critical flags.
_BASELINE_DELTA_WARN = 50.0
_BASELINE_DELTA_CRIT = 100.0
# Output-size caps for the various summary sections.
_SIGNAL_LIMIT = 15
_PROFILE_LIMIT = 6
_WORKLOAD_INDEX_LIMIT = 20
_NODE_WORKLOAD_LIMIT = 12
_NODE_WORKLOAD_TOP = 3
_EVENTS_SUMMARY_LIMIT = 5
# PVC usage percent at which severity becomes critical.
_PVC_CRITICAL_THRESHOLD = 90.0
_CAPACITY_KEYS = {
    "cpu",
    "memory",
    "pods",
    "ephemeral-storage",
}
# Node condition types surfaced as pressure indicators.
_PRESSURE_TYPES = {
    "MemoryPressure",
    "DiskPressure",
    "PIDPressure",
    "NetworkUnavailable",
}
_EVENTS_MAX = 20
_EVENT_WARNING = "Warning"
# Pod phase ordering for severity sorting (higher = worse).
_PHASE_SEVERITY = {
    "Failed": 3,
    "Pending": 2,
    "Unknown": 1,
}
# 15 minutes expressed in hours, for "pending over 15m" checks.
_PENDING_15M_HOURS = 0.25
_LOAD_TOP_COUNT = 5
_NAMESPACE_TOP_COUNT = 5
# PVC usage percent at which a PVC counts as "under pressure".
_PVC_PRESSURE_THRESHOLD = 80.0
_ALERT_TOP_LIMIT = 10
_POD_REASON_LIMIT = 10
_POD_REASON_TREND_LIMIT = 10
_NAMESPACE_ISSUE_LIMIT = 8
_CROSS_NODE_TOP = 3
_CROSS_NAMESPACE_TOP = 3
_CROSS_PVC_TOP = 3
# Map of short keys to kube-state-metrics reason label values.
_POD_TERMINATED_REASONS = {
    "oom_killed": "OOMKilled",
    "error": "Error",
}
_POD_WAITING_REASONS = {
    "crash_loop": "CrashLoopBackOff",
    "image_pull_backoff": "ImagePullBackOff",
    "err_image_pull": "ErrImagePull",
    "create_config_error": "CreateContainerConfigError",
}
_DELTA_TOP_LIMIT = 6
_REASON_TOP_LIMIT = 5
@dataclass(frozen=True)
class ClusterStateSummary:
    """Immutable headline counters for one cluster-state snapshot."""
    # Counters are None when the backing query failed or returned no data.
    nodes_total: int | None
    nodes_ready: int | None
    pods_running: int | None
    kustomizations_not_ready: int | None
    # Number of collection errors accumulated while building the snapshot.
    errors: int
@dataclass(frozen=True)
class SignalContext:
    """Immutable bundle of pre-computed inputs consumed by signal builders."""
    # NOTE(review): field shapes mirror the collector outputs in this package;
    # confirm against the summarizer that constructs this context.
    metrics: dict[str, Any]
    node_context: list[dict[str, Any]]
    namespace_context: list[dict[str, Any]]
    workloads_health: dict[str, Any]
    pod_issues: dict[str, Any]
    kustomizations: dict[str, Any]
def _items(payload: dict[str, Any]) -> list[dict[str, Any]]:
items = payload.get("items") if isinstance(payload.get("items"), list) else []
return [item for item in items if isinstance(item, dict)]
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,105 @@
from __future__ import annotations
import sys
from typing import Any
from ..k8s.client import get_json as _default_get_json
from .cluster_state_flux_events import *
from .cluster_state_nodes import *
from .cluster_state_pods import *
from .cluster_state_workloads import *
# Shape of _fetch_pods' result:
# (workloads, namespace_pods, namespace_nodes, node_pods, pod_issues).
PodFetchResult = tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]
def _get_json(path: str) -> dict[str, Any]:
    """Fetch JSON from the Kubernetes API, honouring facade-level patching.

    Resolves ``get_json`` dynamically on the ``ariadne.services.cluster_state``
    facade module when it is imported — so monkeypatching the facade also
    redirects this module — and falls back to the direct k8s client otherwise.
    """
    facade = sys.modules.get("ariadne.services.cluster_state")
    getter = getattr(facade, "get_json", _default_get_json) if facade is not None else _default_get_json
    return getter(path)
def _fetch_nodes(errors: list[str]) -> tuple[dict[str, Any], list[dict[str, Any]], dict[str, Any]]:
    """Fetch and summarize cluster nodes, recording any failure in *errors*.

    Returns (node summary, per-node details, inventory summary); values not
    computed before a failure stay at their empty defaults.
    """
    node_summary: dict[str, Any] = {}
    node_details: list[dict[str, Any]] = []
    inventory: dict[str, Any] = {}
    try:
        payload = _get_json("/api/v1/nodes")
        node_summary = _summarize_nodes(payload)
        node_details = _node_details(payload)
        inventory = _summarize_inventory(node_details)
    except Exception as exc:  # best-effort: collection must not crash the caller
        errors.append(f"nodes: {exc}")
    return node_summary, node_details, inventory
def _fetch_flux(errors: list[str]) -> dict[str, Any]:
    """Fetch Flux kustomizations; returns {} and records the error on failure."""
    path = "/apis/kustomize.toolkit.fluxcd.io/v1/namespaces/flux-system/kustomizations"
    try:
        return _summarize_kustomizations(_get_json(path))
    except Exception as exc:
        errors.append(f"flux: {exc}")
        return {}
def _fetch_pods(errors: list[str]) -> PodFetchResult:
    """Fetch all pods once and derive every pod-based summary from the payload.

    Returns (workloads, namespace_pods, namespace_nodes, node_pods,
    pod_issues); summaries not computed before a failure stay empty.
    """
    workload_rows: list[dict[str, Any]] = []
    ns_pod_rows: list[dict[str, Any]] = []
    ns_node_rows: list[dict[str, Any]] = []
    node_pod_rows: list[dict[str, Any]] = []
    issues: dict[str, Any] = {}
    try:
        payload = _get_json("/api/v1/pods?limit=5000")
        workload_rows = _summarize_workloads(payload)
        ns_pod_rows = _summarize_namespace_pods(payload)
        ns_node_rows = _summarize_namespace_nodes(payload)
        node_pod_rows = _summarize_node_pods(payload)
        issues = _summarize_pod_issues(payload)
    except Exception as exc:
        errors.append(f"pods: {exc}")
    return workload_rows, ns_pod_rows, ns_node_rows, node_pod_rows, issues
def _fetch_jobs(errors: list[str]) -> dict[str, Any]:
    """Fetch batch jobs and summarize them; {} on failure (error recorded)."""
    try:
        return _summarize_jobs(_get_json("/apis/batch/v1/jobs?limit=2000"))
    except Exception as exc:
        errors.append(f"jobs: {exc}")
        return {}
def _fetch_longhorn(errors: list[str]) -> dict[str, Any]:
    """Fetch Longhorn volumes and summarize; {} on failure (error recorded)."""
    path = "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes"
    try:
        return _summarize_longhorn_volumes(_get_json(path))
    except Exception as exc:
        errors.append(f"longhorn: {exc}")
        return {}
def _fetch_workload_health(errors: list[str]) -> dict[str, Any]:
    """Fetch deployments/statefulsets/daemonsets and merge their health.

    All three payloads are fetched first (in the same order as before), then
    summarized; {} on any failure (error recorded).
    """
    try:
        payloads = [
            _get_json(path)
            for path in (
                "/apis/apps/v1/deployments?limit=2000",
                "/apis/apps/v1/statefulsets?limit=2000",
                "/apis/apps/v1/daemonsets?limit=2000",
            )
        ]
        return _summarize_workload_health(
            _summarize_deployments(payloads[0]),
            _summarize_statefulsets(payloads[1]),
            _summarize_daemonsets(payloads[2]),
        )
    except Exception as exc:
        errors.append(f"workloads_health: {exc}")
        return {}
def _fetch_events(errors: list[str]) -> dict[str, Any]:
    """Fetch cluster events and summarize; {} on failure (error recorded)."""
    try:
        return _summarize_events(_get_json("/api/v1/events?limit=2000"))
    except Exception as exc:
        errors.append(f"events: {exc}")
        return {}
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,117 @@
from __future__ import annotations
from datetime import datetime
from typing import Any
from .cluster_state_contract import *
from .cluster_state_nodes import *
def _summarize_kustomizations(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize Flux kustomizations, listing those not Ready or suspended."""
    entries = _items(payload)
    flagged: list[dict[str, Any]] = []
    for entry in entries:
        metadata = entry.get("metadata") if isinstance(entry.get("metadata"), dict) else {}
        spec = entry.get("spec") if isinstance(entry.get("spec"), dict) else {}
        status = entry.get("status") if isinstance(entry.get("status"), dict) else {}
        ready, reason, message = _condition_status(status.get("conditions"), "Ready")
        suspended = bool(spec.get("suspend"))
        if ready is True and not suspended:
            continue  # healthy and active: nothing to report
        name = metadata.get("name")
        namespace = metadata.get("namespace")
        flagged.append(
            {
                "name": name if isinstance(name, str) else "",
                "namespace": namespace if isinstance(namespace, str) else "",
                "ready": ready,
                "suspended": suspended,
                "reason": reason,
                "message": message,
            }
        )
    flagged.sort(key=lambda row: (row.get("namespace") or "", row.get("name") or ""))
    return {
        "total": len(entries),
        "not_ready": len(flagged),
        "items": flagged,
    }
def _namespace_allowed(namespace: str) -> bool:
    """True when events from *namespace* should be surfaced to operators."""
    if not namespace:
        return False
    # Explicit allow-list wins over the system-namespace filter.
    return namespace in _WORKLOAD_ALLOWED_NAMESPACES or namespace not in _SYSTEM_NAMESPACES
def _event_timestamp(event: dict[str, Any]) -> str:
for key in ("eventTime", "lastTimestamp", "firstTimestamp"):
value = event.get(key)
if isinstance(value, str) and value:
return value
return ""
def _event_sort_key(timestamp: str) -> float:
if not timestamp:
return 0.0
try:
return datetime.fromisoformat(timestamp.replace("Z", "+00:00")).timestamp()
except ValueError:
return 0.0
def _summarize_events(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize Warning events from allowed namespaces.

    Returns totals, per-reason and per-namespace counts (weighted by each
    event's ``count``), the most recent warnings capped at ``_EVENTS_MAX``,
    the dominant reason, and the single latest warning row.
    """
    warnings: list[dict[str, Any]] = []
    by_reason: dict[str, int] = {}
    by_namespace: dict[str, int] = {}
    for event in _items(payload):
        metadata = event.get("metadata") if isinstance(event.get("metadata"), dict) else {}
        namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
        if not _namespace_allowed(namespace):
            continue
        event_type = event.get("type") if isinstance(event.get("type"), str) else ""
        if event_type != _EVENT_WARNING:
            continue
        reason = event.get("reason") if isinstance(event.get("reason"), str) else ""
        message = event.get("message") if isinstance(event.get("message"), str) else ""
        # Missing/invalid count means the event occurred at least once.
        count = event.get("count") if isinstance(event.get("count"), int) else 1
        involved = (
            event.get("involvedObject") if isinstance(event.get("involvedObject"), dict) else {}
        )
        timestamp = _event_timestamp(event)
        warnings.append(
            {
                "namespace": namespace,
                "reason": reason,
                "message": message,
                "count": count,
                "last_seen": timestamp,
                "object_kind": involved.get("kind") or "",
                "object_name": involved.get("name") or "",
            }
        )
        if reason:
            by_reason[reason] = by_reason.get(reason, 0) + count
        if namespace:
            by_namespace[namespace] = by_namespace.get(namespace, 0) + count
    warnings.sort(key=lambda item: _event_sort_key(item.get("last_seen") or ""), reverse=True)
    top = warnings[:_EVENTS_MAX]
    top_reason = ""
    top_reason_count = 0
    if by_reason:
        # min() with the same key replaces sorted(...)[0]: same result
        # (highest count, ties broken alphabetically) in O(n) instead of O(n log n).
        top_reason, top_reason_count = min(
            by_reason.items(), key=lambda item: (-item[1], item[0])
        )
    latest_warning = top[0] if top else None
    return {
        "warnings_total": len(warnings),
        "warnings_by_reason": by_reason,
        "warnings_by_namespace": by_namespace,
        "warnings_recent": top,
        "warnings_top_reason": {"reason": top_reason, "count": top_reason_count},
        "warnings_latest": latest_warning,
    }
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,75 @@
from __future__ import annotations
from typing import Any
from .cluster_state_contract import *
HealthRows = list[dict[str, Any]]
def _health_bullets(metrics: dict[str, Any], nodes_summary: dict[str, Any], workloads_health: dict[str, Any], anomalies: HealthRows) -> list[str]:
bullets: list[str] = []
nodes_total = metrics.get("nodes_total")
nodes_ready = metrics.get("nodes_ready")
if nodes_total is not None and nodes_ready is not None:
bullets.append(f"Nodes ready: {int(nodes_ready)}/{int(nodes_total)}")
pods_running = metrics.get("pods_running") or 0
pods_pending = metrics.get("pods_pending") or 0
pods_failed = metrics.get("pods_failed") or 0
bullets.append(f"Pods: {int(pods_running)} running, {int(pods_pending)} pending, {int(pods_failed)} failed")
not_ready = 0
for key in ("deployments", "statefulsets", "daemonsets"):
entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {}
not_ready += int(entry.get("not_ready") or 0)
if not_ready:
bullets.append(f"Workloads not ready: {not_ready}")
else:
bullets.append("Workloads: all ready")
if anomalies:
top = anomalies[0].get("summary") if isinstance(anomalies[0], dict) else None
if isinstance(top, str) and top:
bullets.append(f"Top concern: {top}")
return bullets[:4]
def _workload_not_ready_items(workloads_health: dict[str, Any]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for key in ("deployments", "statefulsets", "daemonsets"):
entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {}
for item in entry.get("items") or []:
if not isinstance(item, dict):
continue
output.append(
{
"kind": key[:-1],
"namespace": item.get("namespace") or "",
"name": item.get("name") or "",
"desired": item.get("desired"),
"ready": item.get("ready"),
}
)
output.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
return output
def _pod_restarts_top(metrics: dict[str, Any]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in metrics.get("top_restarts_1h") or []:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
pod = metric.get("pod")
if not isinstance(namespace, str) or not isinstance(pod, str):
continue
output.append(
{
"namespace": namespace,
"pod": pod,
"value": item.get("value"),
}
)
output.sort(key=lambda item: (-(item.get("value") or 0), item.get("namespace") or ""))
return output[:5]
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,361 @@
from __future__ import annotations
from typing import Any
from .cluster_state_anomalies import *
from .cluster_state_contract import *
from .cluster_state_flux_events import *
from .cluster_state_health import *
from .cluster_state_vm_client import *
from .cluster_state_vm_trends import *
from .cluster_state_vm_usage import *
def _collect_vm_core(metrics: dict[str, Any], errors: list[str]) -> None:
    """Populate core cluster counters and top-N vectors from the metrics store.

    Writes node/pod counts, capacity vs allocatable totals, restart and
    CPU/memory top lists, and 24h job failures into *metrics*. Any query
    failure is appended to *errors* with a "vm:" prefix; keys written before
    the failure are kept (best-effort collection).
    """
    try:
        metrics["nodes_total"] = _vm_scalar("count(kube_node_info)")
        metrics["nodes_ready"] = _vm_scalar(
            "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})"
        )
        # Capacity vs allocatable lets consumers compute headroom.
        metrics["capacity_cpu"] = _vm_scalar("sum(kube_node_status_capacity_cpu_cores)")
        metrics["allocatable_cpu"] = _vm_scalar("sum(kube_node_status_allocatable_cpu_cores)")
        metrics["capacity_mem_bytes"] = _vm_scalar("sum(kube_node_status_capacity_memory_bytes)")
        metrics["allocatable_mem_bytes"] = _vm_scalar("sum(kube_node_status_allocatable_memory_bytes)")
        metrics["capacity_pods"] = _vm_scalar("sum(kube_node_status_capacity_pods)")
        metrics["allocatable_pods"] = _vm_scalar("sum(kube_node_status_allocatable_pods)")
        metrics["pods_running"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Running\"})")
        metrics["pods_pending"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Pending\"})")
        metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
        metrics["pods_succeeded"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Succeeded\"})")
        metrics["top_restarts_1h"] = _vm_vector(
            f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
        )
        # Namespace-level vectors are filtered to hide system namespaces.
        metrics["restart_namespace_top"] = _filter_namespace_vector(
            _vm_vector(
                f"topk(5, sum by (namespace) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
            )
        )
        metrics["pod_cpu_top"] = _filter_namespace_vector(
            _vm_vector(
                f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
            )
        )
        # *_top_node variants join kube_pod_info to attribute usage to a node.
        metrics["pod_cpu_top_node"] = _filter_namespace_vector(
            _vm_vector(
                f'topk(5, sum by (node,namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]) * on (namespace,pod) group_left(node) kube_pod_info))'
            )
        )
        metrics["pod_mem_top"] = _filter_namespace_vector(
            _vm_vector(
                "topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))"
            )
        )
        metrics["pod_mem_top_node"] = _filter_namespace_vector(
            _vm_vector(
                "topk(5, sum by (node,namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"} * on (namespace,pod) group_left(node) kube_pod_info))"
            )
        )
        metrics["job_failures_24h"] = _vm_vector(
            "topk(5, sum by (namespace,job_name) (increase(kube_job_status_failed[24h])))"
        )
    except Exception as exc:
        errors.append(f"vm: {exc}")
def _collect_node_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
    """Populate per-node usage, usage statistics, and 24h baselines.

    The helper collectors take *errors* themselves and degrade gracefully;
    only the baseline aggregation has its own try/except ("baseline:" prefix).
    """
    metrics["postgres_connections"] = _postgres_connections(errors)
    metrics["hottest_nodes"] = _hottest_nodes(errors)
    metrics["node_usage"] = _node_usage(errors)
    # Aggregate min/max/avg style stats per usage dimension.
    metrics["node_usage_stats"] = {
        "cpu": _usage_stats(metrics.get("node_usage", {}).get("cpu", [])),
        "ram": _usage_stats(metrics.get("node_usage", {}).get("ram", [])),
        "net": _usage_stats(metrics.get("node_usage", {}).get("net", [])),
        "io": _usage_stats(metrics.get("node_usage", {}).get("io", [])),
        "disk": _usage_stats(metrics.get("node_usage", {}).get("disk", [])),
    }
    try:
        node_exprs = _node_usage_exprs()
        # node name -> usage key -> baseline stats (inverted index of the
        # per-key baseline maps, for quick per-node lookups).
        node_baseline_map: dict[str, dict[str, dict[str, float]]] = {}
        for key, expr in node_exprs.items():
            baseline = _vm_baseline_map(expr, "node", _BASELINE_WINDOW)
            metrics.setdefault("node_baseline", {})[key] = _baseline_map_to_list(baseline, "node")
            for name, stats in baseline.items():
                node_baseline_map.setdefault(name, {})[key] = stats
        metrics["node_baseline_map"] = node_baseline_map
    except Exception as exc:
        errors.append(f"baseline: {exc}")
def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
    """Populate trend series (per node, namespace, restarts, jobs, pods, PVCs).

    Everything runs under a single try/except: a failure mid-way keeps the
    trend keys written so far and records one "trends:" entry in *errors*.
    """
    try:
        metrics["node_trends"] = _build_metric_trends(
            _node_usage_exprs(),
            "node",
            "node",
            _TREND_WINDOWS,
            _TREND_NODE_LIMIT,
        )
        metrics["namespace_trends"] = _build_metric_trends(
            _namespace_usage_exprs(),
            "namespace",
            "namespace",
            _TREND_WINDOWS,
            _TREND_NAMESPACE_LIMIT,
        )
        metrics["namespace_request_trends"] = _build_metric_trends(
            _namespace_request_exprs(),
            "namespace",
            "namespace",
            _TREND_WINDOWS,
            _TREND_NAMESPACE_LIMIT,
        )
        # Window-keyed trend dicts: one series per window in _TREND_WINDOWS.
        metrics["restart_trends"] = {
            window: _restart_namespace_trend(window) for window in _TREND_WINDOWS
        }
        metrics["job_failure_trends"] = {
            window: _job_failure_trend(window) for window in _TREND_WINDOWS
        }
        metrics["pods_phase_trends"] = _pods_phase_trends()
        metrics["pvc_usage_trends"] = _pvc_usage_trends()
        metrics["pod_waiting_now"] = _pod_waiting_now()
        metrics["pod_waiting_trends"] = _pod_waiting_trends()
        metrics["pod_terminated_now"] = _pod_terminated_now()
        metrics["pod_terminated_trends"] = _pod_terminated_trends()
        metrics["cluster_trends"] = _cluster_trends()
        metrics["node_condition_trends"] = _node_condition_trends()
        # Totals per short reason key (see _POD_*_REASONS in the contract).
        metrics["pod_reason_totals"] = {
            "waiting": _pod_reason_totals(
                _POD_WAITING_REASONS,
                "kube_pod_container_status_waiting_reason",
            ),
            "terminated": _pod_reason_totals(
                _POD_TERMINATED_REASONS,
                "kube_pod_container_status_terminated_reason",
            ),
        }
    except Exception as exc:
        errors.append(f"trends: {exc}")
def _collect_issue_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
    """Populate the top offending namespaces per pod failure reason.

    Builds metrics["namespace_issue_top"] keyed by a short reason name, each
    value a namespace ranking limited to _NAMESPACE_ISSUE_LIMIT entries.
    Failures are recorded under the "issues:" prefix.
    """
    try:
        waiting_series = "kube_pod_container_status_waiting_reason"
        terminated_series = "kube_pod_container_status_terminated_reason"
        metrics["namespace_issue_top"] = {
            "crash_loop": _namespace_reason_entries(
                f'{waiting_series}{{reason="CrashLoopBackOff"}}',
                _NAMESPACE_ISSUE_LIMIT,
            ),
            "image_pull": _namespace_reason_entries(
                f'{waiting_series}{{reason="ImagePullBackOff"}}',
                _NAMESPACE_ISSUE_LIMIT,
            ),
            "err_image_pull": _namespace_reason_entries(
                f'{waiting_series}{{reason="ErrImagePull"}}',
                _NAMESPACE_ISSUE_LIMIT,
            ),
            "config_error": _namespace_reason_entries(
                f'{waiting_series}{{reason="CreateContainerConfigError"}}',
                _NAMESPACE_ISSUE_LIMIT,
            ),
            "oom_killed": _namespace_reason_entries(
                f'{terminated_series}{{reason="OOMKilled"}}',
                _NAMESPACE_ISSUE_LIMIT,
            ),
            "terminated_error": _namespace_reason_entries(
                f'{terminated_series}{{reason="Error"}}',
                _NAMESPACE_ISSUE_LIMIT,
            ),
        }
    except Exception as exc:
        errors.append(f"issues: {exc}")
def _collect_alert_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
    """Populate metrics["alerts"] from VM rules and Alertmanager (best-effort)."""
    try:
        active = _vm_alerts_now()
        trends = {window: _vm_alerts_trend(window) for window in _TREND_WINDOWS}
        am_alerts = _alertmanager_alerts(errors)
        metrics["alerts"] = {
            "vm": {
                "active": active,
                "active_total": len(active),
            },
            "alertmanager": _summarize_alerts(am_alerts) if am_alerts else {},
            "trends": trends,
        }
    except Exception as exc:
        errors.append(f"alerts: {exc}")
def _collect_namespace_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
    """Populate namespace-level usage/request tops, capacity, and baselines.

    Three independent best-effort phases: top-N vectors plus capacity/totals
    ("namespace_usage:" errors), 24h baselines ("baseline:" errors), and an
    unconditional capacity summary computed from whatever was collected.
    """
    try:
        metrics["namespace_cpu_top"] = _filter_namespace_vector(
            _vm_vector(
                f'topk(5, sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
            )
        )
        metrics["namespace_mem_top"] = _filter_namespace_vector(
            _vm_vector(
                "topk(5, sum by (namespace) (container_memory_working_set_bytes{namespace!=\"\"}))"
            )
        )
        metrics["namespace_cpu_requests_top"] = _filter_namespace_vector(
            _vm_vector(
                "topk(5, sum by (namespace) (kube_pod_container_resource_requests_cpu_cores))"
            )
        )
        metrics["namespace_mem_requests_top"] = _filter_namespace_vector(
            _vm_vector(
                "topk(5, sum by (namespace) (kube_pod_container_resource_requests_memory_bytes))"
            )
        )
        # Net combines receive+transmit; IO combines reads+writes.
        metrics["namespace_net_top"] = _filter_namespace_vector(
            _vm_vector(
                f"topk(5, sum by (namespace) (rate(container_network_receive_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}]) + rate(container_network_transmit_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}])))"
            )
        )
        metrics["namespace_io_top"] = _filter_namespace_vector(
            _vm_vector(
                f"topk(5, sum by (namespace) (rate(container_fs_reads_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}]) + rate(container_fs_writes_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}])))"
            )
        )
        # Full (un-topk'd) totals feed the capacity view and sorted lists below.
        namespace_cpu_usage = _vm_namespace_totals(
            f'sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]))'
        )
        namespace_cpu_requests = _vm_namespace_totals(
            "sum by (namespace) (kube_pod_container_resource_requests_cpu_cores)"
        )
        namespace_mem_usage = _vm_namespace_totals(
            'sum by (namespace) (container_memory_working_set_bytes{namespace!=""})'
        )
        namespace_mem_requests = _vm_namespace_totals(
            "sum by (namespace) (kube_pod_container_resource_requests_memory_bytes)"
        )
        metrics["namespace_capacity"] = _build_namespace_capacity(
            namespace_cpu_usage,
            namespace_cpu_requests,
            namespace_mem_usage,
            namespace_mem_requests,
        )
        metrics["namespace_totals"] = {
            "cpu": _namespace_totals_list(namespace_cpu_usage),
            "mem": _namespace_totals_list(namespace_mem_usage),
            "cpu_requests": _namespace_totals_list(namespace_cpu_requests),
            "mem_requests": _namespace_totals_list(namespace_mem_requests),
        }
    except Exception as exc:
        errors.append(f"namespace_usage: {exc}")
    try:
        namespace_exprs = _namespace_usage_exprs()
        # namespace -> usage key -> baseline stats (inverted per-key maps).
        namespace_baseline_map: dict[str, dict[str, dict[str, float]]] = {}
        for key, expr in namespace_exprs.items():
            baseline = _vm_baseline_map(expr, "namespace", _BASELINE_WINDOW)
            metrics.setdefault("namespace_baseline", {})[key] = _baseline_map_to_list(baseline, "namespace")
            for name, stats in baseline.items():
                namespace_baseline_map.setdefault(name, {})[key] = stats
        metrics["namespace_baseline_map"] = namespace_baseline_map
    except Exception as exc:
        errors.append(f"baseline: {exc}")
    metrics["namespace_capacity_summary"] = _namespace_capacity_summary(
        metrics.get("namespace_capacity", []),
    )
def _finalize_metrics(metrics: dict[str, Any]) -> None:
    """Attach measurement units and query-window metadata to *metrics*.

    Consumers use "units" to label raw values and "windows" to explain which
    time ranges the rates, restart counts, and trends were computed over.
    """
    metrics["units"] = {
        "cpu": "percent",
        "ram": "percent",
        "net": "bytes_per_sec",
        "io": "bytes_per_sec",
        "disk": "percent",
        "restarts": "count",
        "pod_cpu": "cores",
        "pod_mem": "bytes",
        "pod_cpu_top_node": "cores",
        "pod_mem_top_node": "bytes",
        "job_failures_24h": "count",
        "namespace_cpu": "cores",
        "namespace_mem": "bytes",
        "namespace_cpu_requests": "cores",
        "namespace_mem_requests": "bytes",
        "namespace_net": "bytes_per_sec",
        "namespace_io": "bytes_per_sec",
        "pvc_used_percent": "percent",
        "capacity_cpu": "cores",
        "allocatable_cpu": "cores",
        "capacity_mem_bytes": "bytes",
        "allocatable_mem_bytes": "bytes",
        "capacity_pods": "count",
        "allocatable_pods": "count",
    }
    metrics["windows"] = {
        "rates": _RATE_WINDOW,
        "restarts": _RESTARTS_WINDOW,
        "trend": _TREND_WINDOWS,
    }
def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
    """Run every metrics collector in order and assemble the full payload."""
    metrics: dict[str, Any] = {}
    # Collectors mutate `metrics` in place and record failures in `errors`.
    for collector in (
        _collect_vm_core,
        _collect_node_metrics,
        _collect_trend_metrics,
        _collect_alert_metrics,
        _collect_namespace_metrics,
        _collect_issue_metrics,
    ):
        collector(metrics, errors)
    metrics["pvc_usage_top"] = _pvc_usage(errors)
    # Trend summary depends on the trend keys collected above.
    metrics["trend_summary"] = _trend_summary(metrics)
    _finalize_metrics(metrics)
    return metrics
def _trend_summary(metrics: dict[str, Any]) -> dict[str, Any]:
    """Condense raw trend blocks into top-5 "avg" entries per window."""
    def trend_block(name: str) -> dict[str, Any]:
        value = metrics.get(name)
        return value if isinstance(value, dict) else {}

    node_trends = trend_block("node_trends")
    namespace_trends = trend_block("namespace_trends")
    restarts = trend_block("restart_trends")
    job_failures = trend_block("job_failure_trends")
    summary: dict[str, Any] = {}
    # Node and namespace blocks share one shape: metric -> window -> {"avg": [...]}.
    for source, metric_key, target in (
        (node_trends, "cpu", "node_cpu"),
        (node_trends, "ram", "node_ram"),
        (namespace_trends, "cpu", "namespace_cpu"),
        (namespace_trends, "mem", "namespace_mem"),
    ):
        raw = source.get(metric_key)
        metric_block = raw if isinstance(raw, dict) else {}
        summary[target] = {
            window: _limit_entries((metric_block.get(window) or {}).get("avg", []), 5)
            for window in _TREND_WINDOWS
        }
    summary["restarts"] = {window: _limit_entries(entries or [], 5) for window, entries in restarts.items()}
    summary["job_failures"] = {
        window: _limit_entries(entries or [], 5) for window, entries in job_failures.items()
    }
    return summary
def _build_offenders(metrics: dict[str, Any]) -> dict[str, Any]:
    """Collect the worst offenders (restarts, waits, failures, PVC pressure)."""
    return {
        "pod_restarts_1h": _pod_restarts_top(metrics),
        "pod_waiting_now": metrics.get("pod_waiting_now") or {},
        "pod_terminated_now": metrics.get("pod_terminated_now") or {},
        "job_failures_24h": metrics.get("job_failures_24h") or [],
        "pvc_pressure": _pvc_pressure_entries(metrics),
        "namespace_issues": metrics.get("namespace_issue_top") or {},
    }
def _namespace_totals_list(totals: dict[str, float]) -> list[dict[str, Any]]:
entries = [
{"namespace": name, "value": value}
for name, value in totals.items()
if isinstance(name, str) and name
]
entries.sort(key=lambda item: (-(item.get("value") or 0), item.get("namespace") or ""))
return entries
# Re-export every private helper (plus the shared contract names) so sibling
# modules can pull this module's helpers in via a star import.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,401 @@
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from .cluster_state_contract import *
def _node_usage_by_hardware(node_load: list[dict[str, Any]], node_details: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Average per-node load samples into per-hardware-class rows."""
    if not (node_load and node_details):
        return []
    node_to_hw = _hardware_map(node_details)
    buckets: dict[str, dict[str, list[float]]] = {}
    for sample in node_load:
        if not isinstance(sample, dict):
            continue
        node_name = sample.get("node")
        if not (isinstance(node_name, str) and node_name):
            continue
        # Nodes missing from the details list fall into the "unknown" bucket.
        _append_hardware_usage(buckets, str(node_to_hw.get(node_name, "unknown")), sample)
    return _finalize_hardware_usage(buckets)
def _hardware_map(node_details: list[dict[str, Any]]) -> dict[str, str]:
mapping: dict[str, str] = {}
for node in node_details:
if not isinstance(node, dict):
continue
name = node.get("name")
if isinstance(name, str) and name:
mapping[name] = str(node.get("hardware") or "unknown")
return mapping
def _append_hardware_usage(buckets: dict[str, dict[str, list[float]]], hardware: str, entry: dict[str, Any]) -> None:
bucket = buckets.setdefault(hardware, {"load_index": [], "cpu": [], "ram": [], "net": [], "io": []})
for key in ("load_index", "cpu", "ram", "net", "io"):
value = entry.get(key)
if isinstance(value, (int, float)):
bucket[key].append(float(value))
def _finalize_hardware_usage(buckets: dict[str, dict[str, list[float]]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for hardware, metrics in buckets.items():
row: dict[str, Any] = {"hardware": hardware}
for key, values in metrics.items():
if values:
row[key] = sum(values) / len(values)
output.append(row)
output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("hardware") or ""))
return output
def _node_ready(conditions: Any) -> bool:
if not isinstance(conditions, list):
return False
for condition in conditions:
if not isinstance(condition, dict):
continue
if condition.get("type") == "Ready":
return condition.get("status") == "True"
return False
def _summarize_nodes(payload: dict[str, Any]) -> dict[str, Any]:
    """Count ready/not-ready nodes in a Kubernetes node-list payload."""
    names: list[str] = []
    offline: list[str] = []
    for node in _items(payload):
        meta = node.get("metadata") if isinstance(node.get("metadata"), dict) else {}
        node_status = node.get("status") if isinstance(node.get("status"), dict) else {}
        node_name = meta.get("name") if isinstance(meta.get("name"), str) else ""
        if not node_name:
            continue
        names.append(node_name)
        if not _node_ready(node_status.get("conditions")):
            offline.append(node_name)
    names.sort()
    offline.sort()
    return {
        "total": len(names),
        "ready": len(names) - len(offline),
        "not_ready": len(offline),
        "names": names,
        "not_ready_names": offline,
    }
def _node_labels(labels: dict[str, Any]) -> dict[str, Any]:
if not isinstance(labels, dict):
return {}
keep: dict[str, Any] = {}
for key, value in labels.items():
if key.startswith("node-role.kubernetes.io/"):
keep[key] = value
if key in {
"kubernetes.io/arch",
"kubernetes.io/hostname",
"beta.kubernetes.io/arch",
"hardware",
"jetson",
}:
keep[key] = value
return keep
def _node_addresses(status: dict[str, Any]) -> dict[str, str]:
addresses = status.get("addresses") if isinstance(status.get("addresses"), list) else []
output: dict[str, str] = {}
for addr in addresses:
if not isinstance(addr, dict):
continue
addr_type = addr.get("type")
addr_value = addr.get("address")
if isinstance(addr_type, str) and isinstance(addr_value, str):
output[addr_type] = addr_value
return output
def _node_details(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Build a sorted per-node detail row for every named node in *payload*."""
    def _row(node: dict[str, Any]) -> dict[str, Any] | None:
        # Each sub-object is normalized to a dict so missing sections read as empty.
        meta = node.get("metadata") if isinstance(node.get("metadata"), dict) else {}
        node_spec = node.get("spec") if isinstance(node.get("spec"), dict) else {}
        node_status = node.get("status") if isinstance(node.get("status"), dict) else {}
        info = node_status.get("nodeInfo") if isinstance(node_status.get("nodeInfo"), dict) else {}
        node_labels = meta.get("labels") if isinstance(meta.get("labels"), dict) else {}
        node_name = meta.get("name") if isinstance(meta.get("name"), str) else ""
        if not node_name:
            return None
        created = meta.get("creationTimestamp") if isinstance(meta.get("creationTimestamp"), str) else ""
        return {
            "name": node_name,
            "ready": _node_ready(node_status.get("conditions")),
            "roles": _node_roles(node_labels),
            "is_worker": _node_is_worker(node_labels),
            "labels": _node_labels(node_labels),
            "hardware": _hardware_hint(node_labels, info),
            "arch": info.get("architecture") or "",
            "os": info.get("operatingSystem") or "",
            "kernel": info.get("kernelVersion") or "",
            "kubelet": info.get("kubeletVersion") or "",
            "container_runtime": info.get("containerRuntimeVersion") or "",
            "addresses": _node_addresses(node_status),
            "created_at": created,
            "age_hours": _age_hours(created),
            "taints": _node_taints(node_spec.get("taints")),
            "unschedulable": bool(node_spec.get("unschedulable")),
            "capacity": _node_capacity(node_status.get("capacity")),
            "allocatable": _node_capacity(node_status.get("allocatable")),
            "pressure": _node_pressure_conditions(node_status.get("conditions")),
        }

    rows = [row for row in (_row(node) for node in _items(payload)) if row is not None]
    rows.sort(key=lambda row: row.get("name") or "")
    return rows
def _age_hours(timestamp: str) -> float | None:
if not timestamp:
return None
try:
parsed = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
except ValueError:
return None
return round((datetime.now(timezone.utc) - parsed).total_seconds() / 3600, 1)
def _node_age_stats(details: list[dict[str, Any]]) -> dict[str, Any]:
ages: list[tuple[str, float]] = []
for node in details:
name = node.get("name") if isinstance(node, dict) else ""
age = node.get("age_hours")
if isinstance(name, str) and name and isinstance(age, (int, float)):
ages.append((name, float(age)))
if not ages:
return {}
ages.sort(key=lambda item: item[1])
values = [age for _, age in ages]
return {
"min": round(min(values), 1),
"max": round(max(values), 1),
"avg": round(sum(values) / len(values), 1),
"youngest": [{"name": name, "age_hours": age} for name, age in ages[:5]],
"oldest": [{"name": name, "age_hours": age} for name, age in ages[-5:]],
}
def _node_flagged(details: list[dict[str, Any]], key: str) -> list[str]:
names: list[str] = []
for node in details:
name = node.get("name") if isinstance(node, dict) else ""
if not isinstance(name, str) or not name:
continue
if node.get(key):
names.append(name)
names.sort()
return names
def _node_taints(raw: Any) -> list[dict[str, str]]:
if not isinstance(raw, list):
return []
taints: list[dict[str, str]] = []
for entry in raw:
if not isinstance(entry, dict):
continue
key = entry.get("key")
effect = entry.get("effect")
value = entry.get("value")
if isinstance(key, str) and isinstance(effect, str):
taints.append(
{
"key": key,
"value": value if isinstance(value, str) else "",
"effect": effect,
}
)
return taints
def _summarize_inventory(details: list[dict[str, Any]]) -> dict[str, Any]:
    """Roll per-node detail rows up into a cluster-wide inventory summary."""
    summary: dict[str, Any] = {
        "total": 0,
        "ready": 0,
        "workers": {"total": 0, "ready": 0},
        "by_hardware": {},
        "by_arch": {},
        "by_role": {},
        "not_ready_names": [],
        "pressure_nodes": {cond: [] for cond in _PRESSURE_TYPES},
        "age_stats": {},
        "tainted_nodes": [],
        "unschedulable_nodes": [],
    }
    offline: list[str] = []
    for node in details:
        node_name = _apply_node_summary(summary, node)
        if node_name and not node.get("ready"):
            offline.append(node_name)
    summary["not_ready_names"] = sorted(offline)
    for names in summary["pressure_nodes"].values():
        names.sort()
    summary["age_stats"] = _node_age_stats(details)
    summary["tainted_nodes"] = _node_flagged(details, "taints")
    summary["unschedulable_nodes"] = _node_flagged(details, "unschedulable")
    return summary
def _hardware_groups(details: list[dict[str, Any]]) -> list[dict[str, Any]]:
groups: dict[str, list[str]] = {}
for node in details:
if not isinstance(node, dict):
continue
name = node.get("name")
if not isinstance(name, str) or not name:
continue
hardware = str(node.get("hardware") or "unknown")
groups.setdefault(hardware, []).append(name)
output: list[dict[str, Any]] = []
for hardware, nodes in groups.items():
nodes.sort()
output.append({"hardware": hardware, "count": len(nodes), "nodes": nodes})
output.sort(key=lambda item: (-(item.get("count") or 0), item.get("hardware") or ""))
return output
def _pressure_summary(nodes_summary: dict[str, Any]) -> dict[str, Any]:
pressure_nodes = nodes_summary.get("pressure_nodes") if isinstance(nodes_summary, dict) else {}
summary: dict[str, Any] = {"by_type": {}, "total": 0}
if isinstance(pressure_nodes, dict):
for cond, names in pressure_nodes.items():
count = len(names) if isinstance(names, list) else 0
summary["by_type"][cond] = count
summary["total"] += count
unschedulable = nodes_summary.get("unschedulable_nodes") or []
summary["unschedulable"] = len(unschedulable) if isinstance(unschedulable, list) else 0
return summary
def _apply_node_summary(summary: dict[str, Any], node: dict[str, Any]) -> str:
    """Fold one node row into the inventory *summary*; return its name ("" if skipped)."""
    node_name = node.get("name") if isinstance(node, dict) else ""
    if not isinstance(node_name, str) or not node_name:
        return ""
    summary["total"] += 1
    is_ready = bool(node.get("ready"))
    if is_ready:
        summary["ready"] += 1
    if node.get("is_worker"):
        summary["workers"]["total"] += 1
        if is_ready:
            summary["workers"]["ready"] += 1
    # Bucket counts by hardware class and CPU architecture.
    for bucket_key, value in (
        ("by_hardware", node.get("hardware") or "unknown"),
        ("by_arch", node.get("arch") or "unknown"),
    ):
        bucket = summary[bucket_key]
        bucket[value] = bucket.get(value, 0) + 1
    role_counts = summary["by_role"]
    for role in node.get("roles") or []:
        role_counts[role] = role_counts.get(role, 0) + 1
    _apply_pressure(summary, node, node_name)
    return node_name
def _apply_pressure(summary: dict[str, Any], node: dict[str, Any], name: str) -> None:
pressure = node.get("pressure") or {}
if not isinstance(pressure, dict):
return
for cond_type, active in pressure.items():
if active and cond_type in summary["pressure_nodes"]:
summary["pressure_nodes"][cond_type].append(name)
def _node_capacity(raw: Any) -> dict[str, str]:
    """Project the tracked capacity keys of *raw* to strings, dropping blanks."""
    if not isinstance(raw, dict):
        return {}
    return {
        key: str(raw[key])
        for key in _CAPACITY_KEYS
        if isinstance(raw.get(key), (str, int, float)) and raw.get(key) != ""
    }
def _node_pressure_conditions(conditions: Any) -> dict[str, bool]:
    """Map each tracked pressure condition type in *conditions* to its active flag."""
    if not isinstance(conditions, list):
        return {}
    return {
        cond["type"]: cond.get("status") == "True"
        for cond in conditions
        if isinstance(cond, dict) and cond.get("type") in _PRESSURE_TYPES
    }
def _node_roles(labels: dict[str, Any]) -> list[str]:
roles: list[str] = []
for key in labels.keys():
if key.startswith("node-role.kubernetes.io/"):
role = key.split("/", 1)[-1]
if role:
roles.append(role)
return sorted(set(roles))
def _node_is_worker(labels: dict[str, Any]) -> bool:
if "node-role.kubernetes.io/control-plane" in labels:
return False
if "node-role.kubernetes.io/master" in labels:
return False
if "node-role.kubernetes.io/worker" in labels:
return True
return True
def _hardware_hint(labels: dict[str, Any], node_info: dict[str, Any]) -> str:
result = "unknown"
if str(labels.get("jetson") or "").lower() == "true":
result = "jetson"
else:
hardware = (labels.get("hardware") or "").strip().lower()
if hardware:
result = hardware
else:
kernel = str(node_info.get("kernelVersion") or "").lower()
os_image = str(node_info.get("osImage") or "").lower()
if "tegra" in kernel or "jetson" in os_image:
result = "jetson"
elif "raspi" in kernel or "bcm2711" in kernel:
result = "rpi"
else:
arch = str(node_info.get("architecture") or "").lower()
if arch == "amd64":
result = "amd64"
elif arch == "arm64":
result = "arm64-unknown"
return result
def _condition_status(conditions: Any, cond_type: str) -> tuple[bool | None, str, str]:
if not isinstance(conditions, list):
return None, "", ""
for condition in conditions:
if not isinstance(condition, dict):
continue
if condition.get("type") != cond_type:
continue
status = condition.get("status")
if status == "True":
return True, condition.get("reason") or "", condition.get("message") or ""
if status == "False":
return False, condition.get("reason") or "", condition.get("message") or ""
return None, condition.get("reason") or "", condition.get("message") or ""
return None, "", ""
# Re-export every private helper (plus the shared contract names) so sibling
# modules can pull this module's helpers in via a star import.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,340 @@
from __future__ import annotations
from typing import Any
from .cluster_state_contract import *
from .cluster_state_flux_events import *
from .cluster_state_nodes import *
def _workload_from_labels(labels: dict[str, Any]) -> tuple[str, str]:
    """Return (workload-name, "label:<key>") from the first matching workload label."""
    for label_key in _WORKLOAD_LABEL_KEYS:
        candidate = labels.get(label_key)
        if isinstance(candidate, str) and candidate:
            return candidate, f"label:{label_key}"
    return "", ""
def _owner_reference(metadata: dict[str, Any]) -> tuple[str, str]:
owners = metadata.get("ownerReferences") if isinstance(metadata.get("ownerReferences"), list) else []
for owner in owners:
if not isinstance(owner, dict):
continue
name = owner.get("name")
kind = owner.get("kind")
if isinstance(name, str) and name:
return name, f"owner:{kind or 'unknown'}"
return "", ""
def _pod_workload(meta: dict[str, Any]) -> tuple[str, str]:
    """Resolve a pod's workload name, preferring labels over owner references."""
    pod_labels = meta.get("labels") if isinstance(meta.get("labels"), dict) else {}
    workload, origin = _workload_from_labels(pod_labels)
    return (workload, origin) if workload else _owner_reference(meta)
def _summarize_workloads(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Aggregate pods into (namespace, workload) rows with node spread and counts."""
    aggregated: dict[tuple[str, str], dict[str, Any]] = {}
    for pod in _items(payload):
        meta = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
        pod_spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
        pod_status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
        ns = meta.get("namespace") if isinstance(meta.get("namespace"), str) else ""
        if not _namespace_allowed(ns):
            continue
        workload, origin = _pod_workload(meta)
        if not workload:
            continue
        row = aggregated.setdefault(
            (ns, workload),
            {
                "namespace": ns,
                "workload": workload,
                "source": origin,
                "nodes": {},
                "pods_total": 0,
                "pods_running": 0,
            },
        )
        row["pods_total"] += 1
        phase = pod_status.get("phase") if isinstance(pod_status.get("phase"), str) else ""
        if phase == "Running":
            row["pods_running"] += 1
        node_name = pod_spec.get("nodeName") if isinstance(pod_spec.get("nodeName"), str) else ""
        if node_name:
            row["nodes"][node_name] = row["nodes"].get(node_name, 0) + 1
    rows: list[dict[str, Any]] = []
    for row in aggregated.values():
        spread = row.get("nodes") or {}
        # Primary node = most pods, ties broken by name.
        row["primary_node"] = (
            min(spread.items(), key=lambda item: (-item[1], item[0]))[0]
            if isinstance(spread, dict) and spread
            else ""
        )
        rows.append(row)
    rows.sort(key=lambda row: (row.get("namespace") or "", row.get("workload") or ""))
    return rows
def _summarize_namespace_pods(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Count pods per namespace broken down by phase, busiest namespaces first."""
    phase_fields = {
        "Running": "pods_running",
        "Pending": "pods_pending",
        "Failed": "pods_failed",
        "Succeeded": "pods_succeeded",
    }
    rows: dict[str, dict[str, Any]] = {}
    for pod in _items(payload):
        meta = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
        pod_status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
        ns = meta.get("namespace") if isinstance(meta.get("namespace"), str) else ""
        if not _namespace_allowed(ns):
            continue
        row = rows.setdefault(
            ns,
            {
                "namespace": ns,
                "pods_total": 0,
                "pods_running": 0,
                "pods_pending": 0,
                "pods_failed": 0,
                "pods_succeeded": 0,
            },
        )
        row["pods_total"] += 1
        phase = pod_status.get("phase") if isinstance(pod_status.get("phase"), str) else ""
        field = phase_fields.get(phase)
        if field:
            row[field] += 1
    return sorted(rows.values(), key=lambda row: (-row.get("pods_total", 0), row.get("namespace") or ""))
def _summarize_namespace_nodes(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Per-namespace pod counts plus node distribution and primary node."""
    rows: dict[str, dict[str, Any]] = {}
    for pod in _items(payload):
        meta = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
        pod_spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
        pod_status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
        ns = meta.get("namespace") if isinstance(meta.get("namespace"), str) else ""
        if not _namespace_allowed(ns):
            continue
        # Unscheduled pods have no node and are skipped entirely.
        node_name = pod_spec.get("nodeName") if isinstance(pod_spec.get("nodeName"), str) else ""
        if not node_name:
            continue
        row = rows.setdefault(
            ns,
            {"namespace": ns, "pods_total": 0, "pods_running": 0, "nodes": {}},
        )
        row["pods_total"] += 1
        phase = pod_status.get("phase") if isinstance(pod_status.get("phase"), str) else ""
        if phase == "Running":
            row["pods_running"] += 1
        row["nodes"][node_name] = row["nodes"].get(node_name, 0) + 1
    output: list[dict[str, Any]] = []
    for row in rows.values():
        spread = row.get("nodes") or {}
        row["primary_node"] = (
            min(spread.items(), key=lambda item: (-item[1], item[0]))[0]
            if isinstance(spread, dict) and spread
            else ""
        )
        output.append(row)
    output.sort(key=lambda row: (-row.get("pods_total", 0), row.get("namespace") or ""))
    return output
# Pod phase -> per-node counter field; consumed by _node_pod_apply.
_NODE_PHASE_KEYS = {
    "Running": "pods_running",
    "Pending": "pods_pending",
    "Failed": "pods_failed",
    "Succeeded": "pods_succeeded",
}
def _summarize_node_pods(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Aggregate pods per node: phase counters plus top namespaces."""
    per_node: dict[str, dict[str, Any]] = {}
    for pod in _items(payload):
        parsed = _node_pod_context(pod)
        if parsed is None:
            continue
        node_name, ns, phase = parsed
        _node_pod_apply(_node_pod_entry(per_node, node_name), ns, phase)
    return _node_pod_finalize(per_node)
def _node_pod_context(pod: dict[str, Any]) -> tuple[str, str, str] | None:
    """Extract (node, namespace, phase) for a scheduled pod in an allowed namespace."""
    meta = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
    ns = meta.get("namespace") if isinstance(meta.get("namespace"), str) else ""
    if not _namespace_allowed(ns):
        return None
    pod_spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
    node_name = pod_spec.get("nodeName") if isinstance(pod_spec.get("nodeName"), str) else ""
    if not node_name:
        return None
    pod_status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
    phase = pod_status.get("phase") if isinstance(pod_status.get("phase"), str) else ""
    return node_name, ns, phase
def _node_pod_entry(nodes: dict[str, dict[str, Any]], node: str) -> dict[str, Any]:
return nodes.setdefault(
node,
{
"node": node,
"pods_total": 0,
"pods_running": 0,
"pods_pending": 0,
"pods_failed": 0,
"pods_succeeded": 0,
"namespaces": {},
},
)
def _node_pod_apply(entry: dict[str, Any], namespace: str, phase: str) -> None:
    """Count one pod (and its namespace/phase) into a node accumulator row."""
    entry["pods_total"] += 1
    counter_field = _NODE_PHASE_KEYS.get(phase)
    if counter_field:
        entry[counter_field] = entry[counter_field] + 1
    if namespace:
        per_namespace = entry["namespaces"]
        per_namespace[namespace] = per_namespace.get(namespace, 0) + 1
def _node_pod_finalize(nodes: dict[str, dict[str, Any]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for entry in nodes.values():
namespaces = entry.get("namespaces") or {}
if isinstance(namespaces, dict):
entry["namespaces_top"] = sorted(
namespaces.items(), key=lambda item: (-item[1], item[0])
)[:3]
output.append(entry)
output.sort(key=lambda item: (-item.get("pods_total", 0), item.get("node") or ""))
return output
def _node_pods_top(node_pods: list[dict[str, Any]], limit: int = 5) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for entry in node_pods[:limit]:
if not isinstance(entry, dict):
continue
output.append(
{
"node": entry.get("node"),
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"namespaces_top": entry.get("namespaces_top") or [],
}
)
return output
def _record_pending_pod(pending_oldest: list[dict[str, Any]], info: dict[str, Any]) -> bool:
    """Track a pending pod with a known age; True when it crosses the 15m threshold."""
    hours = info.get("age_hours")
    if hours is None:
        # Pods without a parsable age are ignored entirely.
        return False
    pending_oldest.append(info)
    return hours >= _PENDING_15M_HOURS
def _update_pod_issue(pod: dict[str, Any], acc: dict[str, Any]) -> None:
    """Fold one pod into the issue accumulator *acc*.

    Expects *acc* to carry the keys built by its caller: "items", "counts",
    "pending_oldest", "pending_over_15m", "waiting_reasons", "phase_reasons".
    Pods without both a name and a namespace are dropped before any counter
    is touched.
    """
    metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
    status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
    spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
    namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
    name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
    created_at = (
        metadata.get("creationTimestamp")
        if isinstance(metadata.get("creationTimestamp"), str)
        else ""
    )
    age_hours = _age_hours(created_at)
    # Early exit: unidentifiable pods contribute nothing to the accumulator.
    if not name or not namespace:
        return
    phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
    restarts = 0
    waiting_reasons: list[str] = []
    for container in status.get("containerStatuses") or []:
        if not isinstance(container, dict):
            continue
        # Total restarts across all containers of this pod.
        restarts += int(container.get("restartCount") or 0)
        state = container.get("state") if isinstance(container.get("state"), dict) else {}
        waiting = state.get("waiting") if isinstance(state.get("waiting"), dict) else {}
        reason = waiting.get("reason")
        if isinstance(reason, str) and reason:
            waiting_reasons.append(reason)
            # Global tally of waiting reasons across all scanned pods.
            acc["waiting_reasons"][reason] = acc["waiting_reasons"].get(reason, 0) + 1
    phase_reason = status.get("reason")
    if isinstance(phase_reason, str) and phase_reason:
        acc["phase_reasons"][phase_reason] = acc["phase_reasons"].get(phase_reason, 0) + 1
    # Only phases pre-registered in acc["counts"] are counted.
    if phase in acc["counts"]:
        acc["counts"][phase] += 1
    # A pod is an "issue" when its phase is severity-ranked or it has restarted.
    if phase in _PHASE_SEVERITY or restarts > 0:
        acc["items"].append(
            {
                "namespace": namespace,
                "pod": name,
                "node": spec.get("nodeName") or "",
                "phase": phase,
                "reason": status.get("reason") or "",
                "restarts": restarts,
                "waiting_reasons": sorted(set(waiting_reasons)),
                "created_at": created_at,
                "age_hours": age_hours,
            }
        )
    if phase == "Pending":
        info = {
            "namespace": namespace,
            "pod": name,
            "node": spec.get("nodeName") or "",
            "age_hours": age_hours,
            "reason": status.get("reason") or "",
        }
        # Pending pods with a known age are tracked; long-pending ones counted.
        if _record_pending_pod(acc["pending_oldest"], info):
            acc["pending_over_15m"] += 1
def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
    """Scan pods for problem signals and return a bounded issue digest."""
    acc: dict[str, Any] = {
        "items": [],
        "counts": {phase: 0 for phase in _PHASE_SEVERITY},
        "pending_oldest": [],
        "pending_over_15m": 0,
        "waiting_reasons": {},
        "phase_reasons": {},
    }
    for pod in _items(payload):
        if isinstance(pod, dict):
            _update_pod_issue(pod, acc)
    issues = acc["items"]
    # Severity first, then restart count, then stable namespace/pod ordering.
    issues.sort(
        key=lambda row: (
            -_PHASE_SEVERITY.get(row.get("phase") or "", 0),
            -(row.get("restarts") or 0),
            row.get("namespace") or "",
            row.get("pod") or "",
        )
    )
    oldest = acc["pending_oldest"]
    oldest.sort(key=lambda row: -(row.get("age_hours") or 0.0))
    return {
        "counts": acc["counts"],
        "items": issues[:20],
        "pending_oldest": oldest[:10],
        "pending_over_15m": acc["pending_over_15m"],
        "waiting_reasons": acc["waiting_reasons"],
        "phase_reasons": acc["phase_reasons"],
    }
# Re-export every private helper (plus the shared contract names) so sibling
# modules can pull this module's helpers in via a star import.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,104 @@
from __future__ import annotations
from typing import Any
from .cluster_state_contract import *
# Rows emitted by the summarizers: one dict per node / namespace / workload.
ProfileRows = list[dict[str, Any]]
# Node name -> workload key -> pod count (keys look like "namespace/workload").
NodeWorkloadMap = dict[str, dict[str, int]]
def _node_profiles(node_context: ProfileRows, node_pods: ProfileRows, node_workloads: NodeWorkloadMap) -> ProfileRows:
    """Merge node load context, pod counts, and workload spread into profile rows."""
    pods_by_node = {row.get("node"): row for row in node_pods if isinstance(row, dict)}
    workloads_by_node = node_workloads or {}
    rows: list[dict[str, Any]] = []
    for ctx in node_context:
        if not isinstance(ctx, dict):
            continue
        node_name = ctx.get("node")
        if not isinstance(node_name, str) or not node_name:
            continue
        pod_row = pods_by_node.get(node_name, {})
        spread = workloads_by_node.get(node_name, {})
        top_workloads = sorted(spread.items(), key=lambda item: (-item[1], item[0]))[:_NODE_WORKLOAD_TOP]
        rows.append(
            {
                "node": node_name,
                "ready": ctx.get("ready"),
                "hardware": ctx.get("hardware"),
                "arch": ctx.get("arch"),
                "roles": ctx.get("roles"),
                "pods_total": pod_row.get("pods_total"),
                "pods_running": pod_row.get("pods_running"),
                "namespaces_top": pod_row.get("namespaces_top") or [],
                "workloads_top": top_workloads,
                "load_index": ctx.get("load_index"),
                "cpu": ctx.get("cpu"),
                "ram": ctx.get("ram"),
                "net": ctx.get("net"),
                "io": ctx.get("io"),
                "disk": ctx.get("disk"),
                "baseline_delta": ctx.get("baseline_delta") or {},
            }
        )
    rows.sort(key=lambda row: (-(row.get("load_index") or 0), row.get("node") or ""))
    return rows[:_PROFILE_LIMIT]
def _namespace_profiles(namespace_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Project namespace context rows to profile fields, biggest namespaces first."""
    rows = sorted(
        (row for row in namespace_context if isinstance(row, dict)),
        key=lambda row: (-(row.get("pods_total") or 0), row.get("namespace") or ""),
    )
    return [
        {
            "namespace": row.get("namespace"),
            "pods_total": row.get("pods_total"),
            "pods_running": row.get("pods_running"),
            "primary_node": row.get("primary_node"),
            "nodes_top": row.get("nodes_top") or [],
            "cpu_usage": row.get("cpu_usage"),
            "mem_usage": row.get("mem_usage"),
            "cpu_ratio": row.get("cpu_ratio"),
            "mem_ratio": row.get("mem_ratio"),
            "baseline_delta": row.get("baseline_delta") or {},
        }
        for row in rows[:_PROFILE_LIMIT]
    ]
def _workload_profiles(workloads: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Project workload rows to profile fields with their top-3 node placement."""
    rows = sorted(
        (row for row in workloads if isinstance(row, dict)),
        key=lambda row: (-(row.get("pods_total") or 0), row.get("namespace") or "", row.get("workload") or ""),
    )
    profiles: list[dict[str, Any]] = []
    for row in rows[:_PROFILE_LIMIT]:
        spread = row.get("nodes")
        top_nodes = (
            sorted(spread.items(), key=lambda item: (-item[1], item[0]))[:3]
            if isinstance(spread, dict)
            else []
        )
        profiles.append(
            {
                "namespace": row.get("namespace"),
                "workload": row.get("workload"),
                "source": row.get("source"),
                "pods_total": row.get("pods_total"),
                "pods_running": row.get("pods_running"),
                "primary_node": row.get("primary_node"),
                "nodes_top": top_nodes,
            }
        )
    return profiles
def _build_profiles(node_context: ProfileRows, namespace_context: ProfileRows, node_pods: ProfileRows, workloads: ProfileRows, node_workloads: NodeWorkloadMap) -> dict[str, Any]:
    """Bundle node, namespace, and workload profiles into one mapping."""
    node_rows = _node_profiles(node_context, node_pods, node_workloads)
    namespace_rows = _namespace_profiles(namespace_context)
    workload_rows = _workload_profiles(workloads)
    return {"nodes": node_rows, "namespaces": namespace_rows, "workloads": workload_rows}
# Re-export every private helper (plus the shared contract names) so sibling
# modules can pull this module's helpers in via a star import.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,429 @@
from __future__ import annotations
from typing import Any
from .cluster_state_contract import *
def _vector_to_named(entries: list[dict[str, Any]], label_key: str, name_key: str) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
value = item.get("value")
label = metric.get(label_key) if isinstance(metric, dict) else None
if not isinstance(label, str) or not label:
continue
output.append({name_key: label, "value": value, "metric": metric})
output.sort(key=lambda item: (-(item.get("value") or 0), item.get(name_key) or ""))
return output
def _pvc_top(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
pvc = metric.get("persistentvolumeclaim")
if not isinstance(namespace, str) or not isinstance(pvc, str):
continue
output.append(
{
"namespace": namespace,
"pvc": pvc,
"used_percent": item.get("value"),
}
)
output.sort(key=lambda item: (-(item.get("used_percent") or 0), item.get("namespace") or ""))
return output
def _namespace_context(namespace_pods: list[dict[str, Any]], namespace_nodes: list[dict[str, Any]], namespace_capacity: list[dict[str, Any]], namespace_baseline: dict[str, dict[str, dict[str, float]]]) -> list[dict[str, Any]]:
    """Join per-namespace pod, node-spread, capacity, and baseline data into rows."""
    nodes_by_ns = {row.get("namespace"): row for row in namespace_nodes if isinstance(row, dict)}
    capacity_by_ns = {row.get("namespace"): row for row in namespace_capacity if isinstance(row, dict)}
    rows: list[dict[str, Any]] = []
    for pods_row in namespace_pods:
        if not isinstance(pods_row, dict):
            continue
        ns = pods_row.get("namespace")
        if not isinstance(ns, str) or not ns:
            continue
        nodes_row = nodes_by_ns.get(ns, {})
        capacity_row = capacity_by_ns.get(ns, {})
        spread = nodes_row.get("nodes") if isinstance(nodes_row.get("nodes"), dict) else {}
        top_nodes = (
            [
                {"node": node_name, "pods": pods}
                for node_name, pods in sorted(spread.items(), key=lambda item: (-item[1], item[0]))[:3]
            ]
            if isinstance(spread, dict)
            else []
        )
        baseline = namespace_baseline.get(ns, {}) if isinstance(namespace_baseline, dict) else {}
        # Only non-None deltas make it into the row.
        deltas = {
            metric: delta
            for metric, delta in (
                ("cpu", _baseline_delta(capacity_row.get("cpu_usage"), baseline.get("cpu", {}))),
                ("mem", _baseline_delta(capacity_row.get("mem_usage"), baseline.get("mem", {}))),
            )
            if delta is not None
        }
        rows.append(
            {
                "namespace": ns,
                "pods_total": pods_row.get("pods_total"),
                "pods_running": pods_row.get("pods_running"),
                "pods_pending": pods_row.get("pods_pending"),
                "pods_failed": pods_row.get("pods_failed"),
                "pods_succeeded": pods_row.get("pods_succeeded"),
                "primary_node": nodes_row.get("primary_node"),
                "nodes_top": top_nodes,
                "cpu_usage": capacity_row.get("cpu_usage"),
                "cpu_requests": capacity_row.get("cpu_requests"),
                "cpu_ratio": capacity_row.get("cpu_usage_ratio"),
                "mem_usage": capacity_row.get("mem_usage"),
                "mem_requests": capacity_row.get("mem_requests"),
                "mem_ratio": capacity_row.get("mem_usage_ratio"),
                "baseline_delta": deltas,
            }
        )
    rows.sort(key=lambda row: (-(row.get("pods_total") or 0), row.get("namespace") or ""))
    return rows
def _namespace_nodes_top(namespace_context: list[dict[str, Any]], limit: int = 5) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for entry in namespace_context[:limit]:
if not isinstance(entry, dict):
continue
output.append(
{
"namespace": entry.get("namespace"),
"pods_total": entry.get("pods_total"),
"primary_node": entry.get("primary_node"),
"nodes_top": entry.get("nodes_top") or [],
}
)
return output
def _workload_nodes_top(workloads: list[dict[str, Any]], limit: int = 5) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
entries = [w for w in workloads if isinstance(w, dict)]
entries.sort(
key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or "", item.get("workload") or ""),
)
for entry in entries[:limit]:
output.append(
{
"namespace": entry.get("namespace"),
"workload": entry.get("workload"),
"source": entry.get("source"),
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"primary_node": entry.get("primary_node"),
}
)
return output
def _node_workload_map(workloads: list[dict[str, Any]]) -> dict[str, dict[str, int]]:
    """Invert workload records into ``{node: {"ns/workload": pod_count}}``.

    Entries without a usable workload name or a ``nodes`` dict are skipped;
    non-int counts are coerced when possible and non-positive counts dropped.
    """
    mapping: dict[str, dict[str, int]] = {}
    for entry in workloads:
        if not isinstance(entry, dict):
            continue
        namespace = entry.get("namespace")
        workload = entry.get("workload")
        if not isinstance(workload, str) or not workload:
            continue
        nodes = entry.get("nodes")
        if not isinstance(nodes, dict):
            continue
        # Qualify the workload with its namespace when one is present.
        key = f"{namespace}/{workload}" if isinstance(namespace, str) and namespace else workload
        for node, count in nodes.items():
            if not isinstance(node, str) or not node:
                continue
            if not isinstance(count, int):
                try:
                    count = int(count)
                except (TypeError, ValueError):
                    continue
            if count <= 0:
                continue
            # One setdefault per node; the original called it twice per assignment.
            bucket = mapping.setdefault(node, {})
            bucket[key] = bucket.get(key, 0) + count
    return mapping
def _node_workloads_top(workload_map: dict[str, dict[str, int]], limit_nodes: int = _NODE_WORKLOAD_LIMIT, limit_workloads: int = _NODE_WORKLOAD_TOP) -> list[dict[str, Any]]:
    """Rank nodes by hosted pod count, keeping only the busiest workloads per node."""
    ranked: list[dict[str, Any]] = []
    for node_name, per_workload in workload_map.items():
        if not isinstance(node_name, str) or not node_name or not isinstance(per_workload, dict):
            continue
        pods = sum(v for v in per_workload.values() if isinstance(v, int))
        leaders = sorted(per_workload.items(), key=lambda kv: (-kv[1], kv[0]))[:limit_workloads]
        ranked.append({"node": node_name, "pods_total": pods, "workloads_top": leaders})
    ranked.sort(key=lambda row: (-(row.get("pods_total") or 0), row.get("node") or ""))
    return ranked[:limit_nodes]
def _workload_index(workloads: list[dict[str, Any]], limit: int = _WORKLOAD_INDEX_LIMIT) -> list[dict[str, Any]]:
    """Build a ranked index of workloads (largest pod count first) with per-node top lists."""
    entries = [entry for entry in workloads if isinstance(entry, dict)]
    entries.sort(
        key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or "", item.get("workload") or ""),
    )
    output: list[dict[str, Any]] = []
    for entry in entries[:limit]:
        # `nodes` is normalized to a dict here, so the original's second
        # `isinstance(nodes, dict)` guard was dead code and has been removed.
        nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {}
        nodes_top = sorted(nodes.items(), key=lambda item: (-item[1], item[0]))[:_NODE_WORKLOAD_TOP]
        output.append(
            {
                "namespace": entry.get("namespace"),
                "workload": entry.get("workload"),
                "pods_total": entry.get("pods_total"),
                "pods_running": entry.get("pods_running"),
                "primary_node": entry.get("primary_node"),
                "nodes_top": nodes_top,
            }
        )
    return output
def _events_summary(events: dict[str, Any]) -> dict[str, Any]:
    """Condense the raw warning-event payload into headline statistics."""
    if not isinstance(events, dict):
        return {}
    ns_counts = events.get("warnings_by_namespace")
    if not isinstance(ns_counts, dict):
        ns_counts = {}
    leader, leader_count = "", 0
    if ns_counts:
        # Largest count wins; ties broken by namespace name.
        leader, leader_count = min(ns_counts.items(), key=lambda kv: (-kv[1], kv[0]))
    return {
        "warnings_total": events.get("warnings_total"),
        "top_reason": events.get("warnings_top_reason"),
        "top_namespace": {"namespace": leader, "count": leader_count},
        "latest": events.get("warnings_latest"),
        "recent": (events.get("warnings_recent") or [])[:_EVENTS_SUMMARY_LIMIT],
    }
def _build_lexicon() -> dict[str, Any]:
    """Return the glossary of ranking terms and their plain-language aliases."""
    term_meanings = (
        ("hottest", "highest utilization for a metric (cpu, ram, net, io, load_index)."),
        ("pressure", "node condition flags (MemoryPressure, DiskPressure, PIDPressure, NetworkUnavailable)."),
        ("load_index", "composite load score derived from cpu, ram, net, io."),
        ("top", "highest values within a category."),
        ("pods", "running workload instances on a node or namespace."),
        ("workload", "deployment/statefulset/daemonset grouping."),
    )
    aliases = {
        "hot node": "node with highest load_index",
        "hottest by cpu": "node with highest cpu utilization",
        "hottest by ram": "node with highest ram utilization",
        "pressure node": "node with pressure condition flags",
    }
    return {
        "terms": [{"term": term, "meaning": meaning} for term, meaning in term_meanings],
        "aliases": aliases,
    }
def _top_named_entries(entries: list[dict[str, Any]], name_key: str, limit: int) -> list[dict[str, Any]]:
    """Coerce entries into {name, value} pairs and keep the *limit* largest values."""

    def _coerce(raw: Any) -> float:
        # Non-numeric values rank as 0.0 rather than being dropped.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.0

    rows = [
        {"name": item.get(name_key), "value": _coerce(item.get("value"))}
        for item in entries or []
        if isinstance(item, dict) and isinstance(item.get(name_key), str) and item.get(name_key)
    ]
    rows.sort(key=lambda row: -(row.get("value") or 0))
    return rows[:limit]
def _cross_node_metric_top(metrics: dict[str, Any], node_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Cross-reference each metric's top nodes with their full node context record."""
    usage_raw = metrics.get("node_usage")
    usage = usage_raw if isinstance(usage_raw, dict) else {}
    by_name = {ctx.get("node"): ctx for ctx in node_context if isinstance(ctx, dict)}
    context_keys = (
        "cpu", "ram", "net", "io", "disk",
        "load_index", "pods_total", "hardware", "roles", "pressure_flags",
    )
    rows: list[dict[str, Any]] = []
    for metric_name in ("cpu", "ram", "net", "io", "disk"):
        series = usage.get(metric_name)
        if not isinstance(series, list):
            continue
        for leader in _top_named_entries(series, "node", _CROSS_NODE_TOP):
            node_name = leader.get("name")
            if not node_name:
                continue
            ctx = by_name.get(node_name, {})
            row: dict[str, Any] = {"metric": metric_name, "node": node_name, "value": leader.get("value")}
            row.update({key: ctx.get(key) for key in context_keys})
            rows.append(row)
    return rows
def _cross_namespace_metric_top(metrics: dict[str, Any], namespace_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Cross-reference each metric's top namespaces with their namespace context."""
    raw_top = metrics.get("namespace_top")
    ranked = raw_top if isinstance(raw_top, dict) else {}
    by_name = {
        ctx.get("namespace"): ctx
        for ctx in namespace_context
        if isinstance(ctx, dict) and ctx.get("namespace")
    }
    rows: list[dict[str, Any]] = []
    for metric_name in ("cpu", "mem", "net", "io", "restarts"):
        series = ranked.get(metric_name)
        if not isinstance(series, list):
            continue
        for leader in _top_named_entries(series, "namespace", _CROSS_NAMESPACE_TOP):
            ns = leader.get("name")
            if not ns:
                continue
            ctx = by_name.get(ns, {})
            rows.append(
                {
                    "metric": metric_name,
                    "namespace": ns,
                    "value": leader.get("value"),
                    "pods_total": ctx.get("pods_total"),
                    "pods_running": ctx.get("pods_running"),
                    "cpu_ratio": ctx.get("cpu_ratio"),
                    "mem_ratio": ctx.get("mem_ratio"),
                    "primary_node": ctx.get("primary_node"),
                    "nodes_top": ctx.get("nodes_top") or [],
                }
            )
    return rows
def _build_cross_stats(metrics: dict[str, Any], node_context: list[dict[str, Any]], namespace_context: list[dict[str, Any]], workloads: list[dict[str, Any]]) -> dict[str, Any]:
    """Assemble the cross-cutting 'top N' views over nodes, namespaces, PVCs, workloads."""
    cross: dict[str, Any] = {}
    cross["node_metric_top"] = _cross_node_metric_top(metrics, node_context)
    cross["namespace_metric_top"] = _cross_namespace_metric_top(metrics, namespace_context)
    cross["pvc_top"] = _pvc_top(metrics.get("pvc_usage_top", []))[:_CROSS_PVC_TOP]
    cross["workload_top"] = _workload_nodes_top(workloads, _CROSS_NAMESPACE_TOP)
    return cross
def _node_context(node_details: list[dict[str, Any]], node_load: list[dict[str, Any]], node_baseline: dict[str, dict[str, dict[str, float]]], node_workloads: dict[str, dict[str, int]]) -> list[dict[str, Any]]:
    """Join node detail, live load, baseline stats and workload placement into one
    per-node context record, sorted by ``load_index`` (descending) then node name.

    Args:
        node_details: static node facts (roles, hardware, taints, pressure flags),
            keyed internally by each entry's ``name``.
        node_load: live utilization samples, matched to details via ``node``.
        node_baseline: per-node, per-metric ``{"avg": ..., "max": ...}`` stats.
        node_workloads: per-node ``{workload_key: pod_count}`` placement map.
    """
    load_map = {entry.get("node"): entry for entry in node_load if isinstance(entry, dict)}
    output: list[dict[str, Any]] = []
    for entry in node_details:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name")
        if not isinstance(name, str) or not name:
            continue
        load_entry = load_map.get(name, {})
        baseline = node_baseline.get(name, {}) if isinstance(node_baseline, dict) else {}
        # Percent deviation from the baseline average per metric; metrics with
        # no usable current value or baseline are simply omitted.
        deltas: dict[str, float] = {}
        for key in ("cpu", "ram", "net", "io", "disk"):
            current = load_entry.get(key)
            stats = baseline.get(key, {}) if isinstance(baseline, dict) else {}
            delta = _baseline_delta(current, stats)
            if delta is not None:
                deltas[key] = delta
        workloads = node_workloads.get(name, {}) if isinstance(node_workloads, dict) else {}
        # Busiest workloads on this node: largest pod count first, ties by name.
        workloads_top = sorted(workloads.items(), key=lambda item: (-item[1], item[0]))[:_NODE_WORKLOAD_TOP]
        output.append(
            {
                "node": name,
                "ready": entry.get("ready"),
                "roles": entry.get("roles"),
                "is_worker": entry.get("is_worker"),
                "hardware": entry.get("hardware"),
                "arch": entry.get("arch"),
                "os": entry.get("os"),
                "taints": entry.get("taints"),
                "unschedulable": entry.get("unschedulable"),
                "pressure_flags": entry.get("pressure"),
                "pods_total": load_entry.get("pods_total"),
                "cpu": load_entry.get("cpu"),
                "ram": load_entry.get("ram"),
                "disk": load_entry.get("disk"),
                "net": load_entry.get("net"),
                "io": load_entry.get("io"),
                "load_index": load_entry.get("load_index"),
                "baseline": baseline,
                "baseline_delta": deltas,
                "workloads_top": workloads_top,
            }
        )
    output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("node") or ""))
    return output
def _baseline_delta(current: Any, stats: dict[str, Any]) -> float | None:
if not isinstance(current, (int, float)):
return None
avg = stats.get("avg")
if not isinstance(avg, (int, float)) or avg == 0:
return None
return round(((float(current) - float(avg)) / float(avg)) * 100, 2)
def _delta_severity(delta: float) -> str:
    """Map an absolute baseline deviation onto info/warning/critical buckets."""
    size = abs(delta)
    if size >= _BASELINE_DELTA_CRIT:
        return "critical"
    return "warning" if size >= _BASELINE_DELTA_WARN else "info"
def _delta_entry_label(entry: dict[str, Any]) -> tuple[str, str]:
    """Identify whether a delta entry targets a node or a namespace."""
    scope = "node" if "node" in entry else "namespace"
    return (scope, str(entry.get(scope) or ""))
def _delta_top(entries: list[dict[str, Any]], key: str, limit: int = _DELTA_TOP_LIMIT) -> list[dict[str, Any]]:
    """Rank entries by the magnitude of their baseline delta for metric *key*."""
    ranked: list[dict[str, Any]] = []
    for candidate in entries:
        if not isinstance(candidate, dict):
            continue
        delta_map = candidate.get("baseline_delta")
        if not isinstance(delta_map, dict):
            delta_map = {}
        change = delta_map.get(key)
        if not isinstance(change, (int, float)):
            continue
        scope_key, scope_value = _delta_entry_label(candidate)
        ranked.append(
            {
                scope_key: scope_value,
                "metric": key,
                "delta": change,
                "severity": _delta_severity(float(change)),
            }
        )
    ranked.sort(key=lambda row: (-(abs(row.get("delta") or 0)), row.get("metric") or ""))
    return ranked[:limit]
def _reason_top(counts: dict[str, Any], limit: int = _REASON_TOP_LIMIT) -> list[dict[str, Any]]:
    """Rank event reasons by count (descending), ties broken by reason name."""
    rows = [
        {"reason": reason, "count": int(total)}
        for reason, total in (counts.items() if isinstance(counts, dict) else [])
        if isinstance(reason, str) and reason and isinstance(total, (int, float))
    ]
    rows.sort(key=lambda row: (-row.get("count", 0), row.get("reason") or ""))
    return rows[:limit]
# Facade re-export list: publish every single-underscore helper (plus the shared
# contract names) so the cluster_state facade's `import *` can pick them up.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,160 @@
from __future__ import annotations
from typing import Any
from .cluster_state_anomalies import *
from .cluster_state_contract import *
from .cluster_state_health import *
from .cluster_state_relationships import *
def _pod_issue_summary(pod_issues: dict[str, Any], metrics: dict[str, Any]) -> dict[str, Any]:
    """Summarize top pod waiting/phase reasons plus per-namespace issue leaders."""
    if isinstance(pod_issues, dict):
        waiting = pod_issues.get("waiting_reasons")
        phase = pod_issues.get("phase_reasons")
    else:
        waiting = {}
        phase = {}
    return {
        "waiting_reasons_top": _reason_top(waiting),
        "phase_reasons_top": _reason_top(phase),
        "namespace_issue_top": metrics.get("namespace_issue_top") or {},
    }
def _delta_hit(delta: Any) -> bool:
    """True when *delta* is numeric and meets the warning threshold in magnitude."""
    return isinstance(delta, (int, float)) and abs(float(delta)) >= _BASELINE_DELTA_WARN
def _node_delta_signals(node_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Emit a signal for every node metric that drifted past its baseline threshold."""
    hits: list[dict[str, Any]] = []
    for ctx in node_context:
        if not isinstance(ctx, dict):
            continue
        node_name = ctx.get("node")
        delta_map = ctx.get("baseline_delta") if isinstance(ctx.get("baseline_delta"), dict) else {}
        baseline_map = ctx.get("baseline") if isinstance(ctx.get("baseline"), dict) else {}
        if not isinstance(node_name, str) or not node_name:
            continue
        for metric_name in ("cpu", "ram", "net", "io", "disk"):
            drift = delta_map.get(metric_name)
            if not _delta_hit(drift):
                continue
            metric_baseline = baseline_map.get(metric_name)
            baseline_avg = metric_baseline.get("avg") if isinstance(metric_baseline, dict) else None
            hits.append(
                {
                    "scope": "node",
                    "target": node_name,
                    "metric": metric_name,
                    "current": ctx.get(metric_name),
                    "baseline_avg": baseline_avg,
                    "delta_pct": drift,
                    "severity": _delta_severity(float(drift)),
                }
            )
    return hits
def _namespace_delta_signals(namespace_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Emit signals for namespace cpu/mem usage drifting past the baseline threshold."""
    hits: list[dict[str, Any]] = []
    for ctx in namespace_context:
        if not isinstance(ctx, dict):
            continue
        ns = ctx.get("namespace")
        delta_map = ctx.get("baseline_delta") if isinstance(ctx.get("baseline_delta"), dict) else {}
        baseline_map = ctx.get("baseline") if isinstance(ctx.get("baseline"), dict) else {}
        if not isinstance(ns, str) or not ns:
            continue
        for metric_name, usage_key in (("cpu", "cpu_usage"), ("mem", "mem_usage")):
            drift = delta_map.get(metric_name)
            if not _delta_hit(drift):
                continue
            stats = baseline_map.get(metric_name)
            hits.append(
                {
                    "scope": "namespace",
                    "target": ns,
                    "metric": metric_name,
                    "current": ctx.get(usage_key),
                    "baseline_avg": stats.get("avg") if isinstance(stats, dict) else None,
                    "delta_pct": drift,
                    "severity": _delta_severity(float(drift)),
                }
            )
    return hits
def _kustomization_signals(kustomizations: dict[str, Any]) -> list[dict[str, Any]]:
    """Raise a single warning signal when any Flux kustomizations are not ready."""
    if not isinstance(kustomizations, dict):
        return []
    not_ready = int(kustomizations.get("not_ready") or 0)
    if not_ready <= 0:
        return []
    return [
        {
            "scope": "flux",
            "target": "kustomizations",
            "metric": "not_ready",
            "current": not_ready,
            "severity": "warning",
        }
    ]
def _pod_issue_signals(pod_issues: dict[str, Any]) -> list[dict[str, Any]]:
    """Translate pod-issue counters into review signals.

    Emits a warning when pods have been Pending for over 15 minutes, and a
    critical signal when any pods are in the Failed phase. Returns [] for
    non-dict input.
    """
    if not isinstance(pod_issues, dict):
        return []
    signals: list[dict[str, Any]] = []
    pending_over = int(pod_issues.get("pending_over_15m") or 0)
    if pending_over > 0:
        signals.append(
            {
                "scope": "pods",
                "target": "pending_over_15m",
                "metric": "count",
                "current": pending_over,
                "severity": "warning",
            }
        )
    # `counts` is normalized to a dict right here, so the original's second
    # `isinstance(counts, dict)` check was redundant and has been dropped.
    counts = pod_issues.get("counts") if isinstance(pod_issues.get("counts"), dict) else {}
    failed = int(counts.get("Failed") or 0)
    if failed > 0:
        signals.append(
            {
                "scope": "pods",
                "target": "failed",
                "metric": "count",
                "current": failed,
                "severity": "critical",
            }
        )
    return signals
def _workload_health_signals(workloads_health: dict[str, Any], limit: int = 5) -> list[dict[str, Any]]:
    """Emit warning signals for workloads whose ready count trails the desired count.

    Args:
        workloads_health: raw workload-health payload passed to
            ``_workload_not_ready_items``.
        limit: cap on how many not-ready workloads are reported; the default
            preserves the previously hard-coded top-5 behavior.
    """
    not_ready = _workload_not_ready_items(workloads_health)
    if not not_ready:
        return []
    output: list[dict[str, Any]] = []
    for entry in not_ready[:limit]:
        output.append(
            {
                "scope": "workload",
                "target": f"{entry.get('namespace')}/{entry.get('workload')}",
                "metric": "not_ready",
                # Fall back to 0 when ready/desired counts are missing or falsy.
                "current": entry.get("ready") or 0,
                "desired": entry.get("desired") or 0,
                "severity": "warning",
            }
        )
    return output
def _build_signals(context: SignalContext) -> list[dict[str, Any]]:
    """Collect all signal sources, rank by severity then scope, and cap the result."""
    collectors = (
        _node_delta_signals(context.node_context),
        _namespace_delta_signals(context.namespace_context),
        _workload_health_signals(context.workloads_health),
        _pod_issue_signals(context.pod_issues),
        _kustomization_signals(context.kustomizations),
        _pvc_pressure_signals(context.metrics),
    )
    merged: list[dict[str, Any]] = []
    for batch in collectors:
        merged.extend(batch)
    merged.sort(key=lambda sig: (_severity_rank(sig.get("severity")), sig.get("scope") or ""))
    return merged[:_SIGNAL_LIMIT]
# Facade re-export list: publish every single-underscore helper (plus the shared
# contract names) so the cluster_state facade's `import *` can pick them up.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,309 @@
from __future__ import annotations
import sys
from typing import Any, Callable
import httpx
from ..settings import settings
from .cluster_state_contract import *
from .cluster_state_flux_events import *
from .cluster_state_relationships import *
def _facade_override(name: str, original: Callable[..., Any]) -> Callable[..., Any] | None:
facade = sys.modules.get("ariadne.services.cluster_state")
candidate = getattr(facade, name, None) if facade is not None else None
if candidate is not None and candidate is not original:
return candidate
return None
def _vm_query(expr: str) -> list[dict[str, Any]] | None:
    """Run an instant PromQL query against VictoriaMetrics.

    Returns the raw result list, or None when no backend URL is configured or
    the response is not a successful vector payload.
    """
    base = settings.vm_url
    if not base:
        return None
    endpoint = f"{base.rstrip('/')}/api/v1/query"
    with httpx.Client(timeout=settings.cluster_state_vm_timeout_sec) as client:
        response = client.get(endpoint, params={"query": expr})
        response.raise_for_status()
        body = response.json()
    if body.get("status") != "success":
        return None
    data = body.get("data")
    if not isinstance(data, dict):
        data = {}
    rows = data.get("result")
    return rows if isinstance(rows, list) else None
def _vm_scalar(expr: str) -> float | None:
    """Evaluate *expr* and return its single scalar value, honoring facade patches."""
    patched = _facade_override("_vm_scalar", _vm_scalar)
    if patched is not None:
        return patched(expr)
    rows = _vm_query(expr)
    if not rows:
        return None
    first = rows[0]
    pair = first.get("value") if isinstance(first, dict) else None
    if not isinstance(pair, list) or len(pair) < _VALUE_PAIR_LEN:
        return None
    try:
        return float(pair[1])
    except (TypeError, ValueError):
        return None
def _vm_vector(expr: str) -> list[dict[str, Any]]:
    """Evaluate *expr* and return ``[{metric, value}]`` rows, honoring facade patches."""
    patched = _facade_override("_vm_vector", _vm_vector)
    if patched is not None:
        return patched(expr)
    rows: list[dict[str, Any]] = []
    for sample in _vm_query(expr) or []:
        if not isinstance(sample, dict):
            continue
        labels = sample.get("metric")
        if not isinstance(labels, dict):
            labels = {}
        pair = sample.get("value")
        if not isinstance(pair, list) or len(pair) < _VALUE_PAIR_LEN:
            continue
        try:
            numeric = float(pair[1])
        except (TypeError, ValueError):
            continue
        rows.append({"metric": labels, "value": numeric})
    return rows
def _alert_entries(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Normalize raw alert vector rows into sorted {alert, severity, value} dicts."""
    rows: list[dict[str, Any]] = []
    for sample in entries:
        if not isinstance(sample, dict):
            continue
        labels = sample.get("metric") if isinstance(sample.get("metric"), dict) else {}
        alert_name = labels.get("alertname")
        if not isinstance(alert_name, str) or not alert_name:
            continue
        raw_severity = labels.get("severity")
        rows.append(
            {
                "alert": alert_name,
                "severity": raw_severity if isinstance(raw_severity, str) else "",
                "value": sample.get("value"),
            }
        )
    rows.sort(key=lambda row: (-(row.get("value") or 0), row.get("alert") or ""))
    return rows
def _vm_alerts_now() -> list[dict[str, Any]]:
    """Return currently-firing alerts grouped by name/severity, top entries only."""
    firing = _vm_vector('sum by (alertname,severity) (ALERTS{alertstate="firing"})')
    return _alert_entries(firing)[:_ALERT_TOP_LIMIT]
def _vm_alerts_trend(window: str) -> list[dict[str, Any]]:
    """Return alerts that fired at any point within *window*, grouped and ranked."""
    query = f"topk({_ALERT_TOP_LIMIT}, sum by (alertname,severity) (count_over_time(ALERTS{{alertstate=\"firing\"}}[{window}])))"
    return _alert_entries(_vm_vector(query))
def _alertmanager_alerts(errors: list[str]) -> list[dict[str, Any]]:
    """Fetch active alerts from Alertmanager's v2 API.

    Best-effort: any failure (network, HTTP status, JSON decode) is recorded in
    *errors* and an empty list is returned instead of raising. Non-dict items
    in the response are filtered out.
    """
    base = settings.alertmanager_url
    if not base:
        return []
    url = f"{base.rstrip('/')}/api/v2/alerts"
    try:
        with httpx.Client(timeout=settings.cluster_state_vm_timeout_sec) as client:
            resp = client.get(url)
            resp.raise_for_status()
            payload = resp.json()
            if isinstance(payload, list):
                return [item for item in payload if isinstance(item, dict)]
    except Exception as exc:
        # Deliberately broad: alert collection must never break the whole summary.
        errors.append(f"alertmanager: {exc}")
    return []
def _summarize_alerts(alerts: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate Alertmanager alerts into totals, per-severity counts, and a top list."""
    normalized: list[dict[str, Any]] = []
    severity_counts: dict[str, int] = {}
    for raw in alerts:
        labels = raw.get("labels") if isinstance(raw.get("labels"), dict) else {}
        name = labels.get("alertname")
        if not isinstance(name, str) or not name:
            continue
        severity = labels.get("severity")
        if not isinstance(severity, str):
            severity = ""
        normalized.append({"alert": name, "severity": severity})
        if severity:
            severity_counts[severity] = severity_counts.get(severity, 0) + 1
    normalized.sort(key=lambda row: (row.get("severity") or "", row.get("alert") or ""))
    return {
        "total": len(normalized),
        "by_severity": severity_counts,
        "items": normalized[:_ALERT_TOP_LIMIT],
    }
def _filter_namespace_vector(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop rows without a namespace label or belonging to system namespaces."""
    kept: list[dict[str, Any]] = []
    for row in entries:
        if not isinstance(row, dict):
            continue
        labels = row.get("metric")
        if not isinstance(labels, dict):
            labels = {}
        ns = labels.get("namespace")
        if isinstance(ns, str) and ns and ns not in _SYSTEM_NAMESPACES:
            kept.append(row)
    return kept
def _vm_topk(expr: str, label_key: str) -> dict[str, Any] | None:
    """Return the first sample of *expr* as {label, value, metric}, or None if empty."""
    samples = _vm_vector(expr)
    if not samples:
        return None
    head = samples[0]
    labels = head.get("metric") if isinstance(head, dict) else {}
    label_value = labels.get(label_key) if isinstance(labels, dict) else None
    return {"label": label_value or "", "value": head.get("value"), "metric": labels}
def _vm_node_metric(expr: str, label_key: str) -> list[dict[str, Any]]:
    """Return per-node samples of *expr* as {node, value}, sorted by node name."""
    rows: list[dict[str, Any]] = []
    for sample in _vm_vector(expr):
        labels = sample.get("metric") if isinstance(sample.get("metric"), dict) else {}
        node_name = labels.get(label_key)
        if isinstance(node_name, str) and node_name:
            rows.append({"node": node_name, "value": sample.get("value")})
    return sorted(rows, key=lambda row: row.get("node") or "")
def _vm_baseline_map(expr: str, label_key: str, window: str) -> dict[str, dict[str, float]]:
    """Build ``{label: {"avg": ..., "max": ...}}`` baselines for *expr* over *window*.

    The avg and max series are fetched separately and merged by label; rows
    whose label is missing or empty are dropped. Missing values coerce to 0.
    """
    baseline: dict[str, dict[str, float]] = {}
    # Identical extraction logic for both aggregations — the original duplicated
    # this loop; only the PromQL wrapper function differs.
    for stat_name, samples in (
        ("avg", _vm_vector(f"avg_over_time(({expr})[{window}])")),
        ("max", _vm_vector(f"max_over_time(({expr})[{window}])")),
    ):
        for item in samples:
            metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
            label = metric.get(label_key)
            if not isinstance(label, str) or not label:
                continue
            baseline.setdefault(label, {})[stat_name] = float(item.get("value") or 0)
    return baseline
def _baseline_map_to_list(baseline: dict[str, dict[str, float]], name_key: str) -> list[dict[str, Any]]:
    """Flatten a baseline map into rows sorted by avg (descending) then name."""
    rows = [
        {name_key: label, "avg": stats.get("avg"), "max": stats.get("max")}
        for label, stats in baseline.items()
        if isinstance(label, str) and label
    ]
    rows.sort(key=lambda row: (-(row.get("avg") or 0), row.get(name_key) or ""))
    return rows
def _limit_entries(entries: list[dict[str, Any]], limit: int) -> list[dict[str, Any]]:
    """Return at most *limit* leading entries; a non-positive limit yields []."""
    return entries[:limit] if limit > 0 else []
def _vm_window_series(expr: str, label_key: str, name_key: str, window: str) -> dict[str, list[dict[str, Any]]]:
    """Compute avg/max/p95 of *expr* over *window*, keyed by statistic name."""
    wrappers = {
        "avg": f"avg_over_time(({expr})[{window}])",
        "max": f"max_over_time(({expr})[{window}])",
        "p95": f"quantile_over_time(0.95, ({expr})[{window}])",
    }
    return {
        stat: _vector_to_named(_vm_vector(query), label_key, name_key)
        for stat, query in wrappers.items()
    }
def _trim_window_series(series: dict[str, list[dict[str, Any]]], limit: int) -> dict[str, list[dict[str, Any]]]:
    """Cap every statistic series in *series* at *limit* entries."""
    trimmed: dict[str, list[dict[str, Any]]] = {}
    for stat, entries in series.items():
        trimmed[stat] = _limit_entries(entries, limit)
    return trimmed
def _build_metric_trends(exprs: dict[str, str], label_key: str, name_key: str, windows: tuple[str, ...], limit: int) -> dict[str, dict[str, dict[str, list[dict[str, Any]]]]]:
    """Build per-metric, per-window avg/max/p95 series, trimmed to *limit* rows each."""
    return {
        metric: {
            window: _trim_window_series(_vm_window_series(expr, label_key, name_key, window), limit)
            for window in windows
        }
        for metric, expr in exprs.items()
    }
def _vm_scalar_window(expr: str, window: str, fn: str) -> float | None:
    """Apply range function *fn* to *expr* over *window* and return the scalar."""
    query = f"{fn}(({expr})[{window}])"
    return _vm_scalar(query)
def _scalar_trends(expr: str, windows: tuple[str, ...]) -> dict[str, dict[str, float | None]]:
    """Compute avg/min/max scalars of *expr* for each window in *windows*."""
    trends: dict[str, dict[str, float | None]] = {}
    for window in windows:
        trends[window] = {
            "avg": _vm_scalar_window(expr, window, "avg_over_time"),
            "min": _vm_scalar_window(expr, window, "min_over_time"),
            "max": _vm_scalar_window(expr, window, "max_over_time"),
        }
    return trends
def _cluster_trends() -> dict[str, dict[str, dict[str, float | None]]]:
    """Return cluster-wide trend scalars (avg/min/max per window) for headline metrics.

    Covers node readiness, pod phases, firing alerts, and aggregate cpu/mem/
    network/filesystem usage. Each PromQL expression is evaluated via
    ``_scalar_trends`` over the shared ``_TREND_WINDOWS``.
    """
    exprs = {
        "nodes_ready": 'sum(kube_node_status_condition{condition="Ready",status="true"})',
        "nodes_not_ready": 'sum(kube_node_status_condition{condition="Ready",status="false"})',
        "pods_running": 'sum(kube_pod_status_phase{phase="Running"})',
        "pods_pending": 'sum(kube_pod_status_phase{phase="Pending"})',
        "pods_failed": 'sum(kube_pod_status_phase{phase="Failed"})',
        "pods_succeeded": 'sum(kube_pod_status_phase{phase="Succeeded"})',
        "alerts_firing": 'sum(ALERTS{alertstate="firing"})',
        # Container-level usage is restricted to rows that carry a namespace
        # label (drops cadvisor aggregate/system rows).
        "cpu_usage": f'sum(rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]))',
        "mem_usage": 'sum(container_memory_working_set_bytes{namespace!=""})',
        "net_io": (
            f'sum(rate(container_network_receive_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]) '
            f'+ rate(container_network_transmit_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]))'
        ),
        "fs_io": (
            f'sum(rate(container_fs_reads_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]) '
            f'+ rate(container_fs_writes_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]))'
        ),
    }
    return {key: _scalar_trends(expr, _TREND_WINDOWS) for key, expr in exprs.items()}
def _node_condition_trends() -> dict[str, dict[str, dict[str, float | None]]]:
    """Trend scalars for node readiness, schedulability, and pressure conditions."""
    exprs: dict[str, str] = {
        "ready": 'sum(kube_node_status_condition{condition="Ready",status="true"})',
        "not_ready": 'sum(kube_node_status_condition{condition="Ready",status="false"})',
        "unschedulable": "sum(kube_node_spec_unschedulable)",
    }
    exprs.update(
        (cond.lower(), f'sum(kube_node_status_condition{{condition="{cond}",status="true"}})')
        for cond in _PRESSURE_TYPES
    )
    return {name: _scalar_trends(expr, _TREND_WINDOWS) for name, expr in exprs.items()}
# Facade re-export list: publish every single-underscore helper (plus the shared
# contract names) so the cluster_state facade's `import *` can pick them up.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,187 @@
from __future__ import annotations
from typing import Any
from .cluster_state_contract import *
from .cluster_state_relationships import *
from .cluster_state_vm_client import *
def _pod_reason_totals(
    reasons: dict[str, str],
    series: str,
) -> dict[str, dict[str, dict[str, float | None]]]:
    """Trend totals of *series* filtered by each tracked reason in *reasons*."""
    return {
        key: _scalar_trends(f'sum({series}{{reason="{reason}"}})', _TREND_WINDOWS)
        for key, reason in reasons.items()
    }
def _node_usage_exprs() -> dict[str, str]:
    """Return per-node PromQL expressions for cpu/ram/net/io/disk utilization.

    Each expression joins node-exporter instance-level series onto the node
    name via ``label_replace`` over ``node_uname_info`` so results are keyed
    by the Kubernetes node name rather than the scrape instance.
    """
    return {
        # CPU: percent busy = 100 * (1 - idle rate), averaged per instance.
        "cpu": (
            f'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
            '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
        ),
        # RAM: percent used = (total - available) / total.
        "ram": (
            'avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) '
            '/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
        ),
        # Net: rx+tx bytes/sec across all non-loopback devices.
        "net": (
            f'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
            f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) '
            'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
        ),
        # IO: disk read+write bytes/sec.
        "io": (
            f'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
            '* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
        ),
        # Disk: percent of root filesystem used (tmpfs/overlay excluded).
        "disk": (
            'avg by (node) (((1 - avg by (instance) (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} '
            '/ node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100) * on(instance) group_left(node) '
            'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
        ),
    }
def _namespace_usage_exprs() -> dict[str, str]:
    """Return per-namespace PromQL for live CPU (cores) and memory (bytes) usage."""
    return {
        "cpu": f'sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]))',
        "mem": 'sum by (namespace) (container_memory_working_set_bytes{namespace!=""})',
    }
def _namespace_request_exprs() -> dict[str, str]:
    """Return per-namespace PromQL for CPU/memory resource requests."""
    exprs: dict[str, str] = {}
    exprs["cpu_requests"] = "sum by (namespace) (kube_pod_container_resource_requests_cpu_cores)"
    exprs["mem_requests"] = "sum by (namespace) (kube_pod_container_resource_requests_memory_bytes)"
    return exprs
def _restart_namespace_trend(window: str) -> list[dict[str, Any]]:
    """Top namespaces by container restarts over *window*, system namespaces excluded."""
    query = f"topk({_TREND_NAMESPACE_LIMIT}, sum by (namespace) (increase(kube_pod_container_status_restarts_total[{window}])))"
    filtered = _filter_namespace_vector(_vm_vector(query))
    return _vector_to_named(filtered, "namespace", "namespace")
def _job_failure_trend(window: str) -> list[dict[str, Any]]:
    """Top failing jobs over *window* as {namespace, job, value}, ranked by failures."""
    query = f"topk({_TREND_JOB_LIMIT}, sum by (namespace,job_name) (increase(kube_job_status_failed[{window}])))"
    rows: list[dict[str, Any]] = []
    for sample in _vm_vector(query):
        if not isinstance(sample, dict):
            continue
        labels = sample.get("metric")
        if not isinstance(labels, dict):
            labels = {}
        ns = labels.get("namespace")
        job_name = labels.get("job_name")
        if isinstance(ns, str) and isinstance(job_name, str):
            rows.append({"namespace": ns, "job": job_name, "value": sample.get("value")})
    rows.sort(key=lambda row: (-(row.get("value") or 0), row.get("namespace") or "", row.get("job") or ""))
    return rows
def _pod_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
    """Top pods matching *expr* as {namespace, pod, value}, ranked by value."""
    rows: list[dict[str, Any]] = []
    for sample in _vm_vector(f"topk({limit}, sum by (namespace,pod) ({expr}))"):
        if not isinstance(sample, dict):
            continue
        labels = sample.get("metric")
        if not isinstance(labels, dict):
            labels = {}
        ns = labels.get("namespace")
        pod_name = labels.get("pod")
        if isinstance(ns, str) and isinstance(pod_name, str):
            rows.append({"namespace": ns, "pod": pod_name, "value": sample.get("value")})
    rows.sort(key=lambda row: (-(row.get("value") or 0), row.get("namespace") or "", row.get("pod") or ""))
    return rows
def _namespace_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
    """Top namespaces for *expr*, with system namespaces filtered out."""
    samples = _vm_vector(f"topk({limit}, sum by (namespace) ({expr}))")
    return _vector_to_named(_filter_namespace_vector(samples), "namespace", "namespace")
def _pod_waiting_now() -> dict[str, list[dict[str, Any]]]:
    """Current pods stuck in each tracked container waiting reason."""
    return {
        key: _pod_reason_entries(f'kube_pod_container_status_waiting_reason{{reason="{reason}"}}', _POD_REASON_LIMIT)
        for key, reason in _POD_WAITING_REASONS.items()
    }
def _pod_waiting_trends() -> dict[str, dict[str, list[dict[str, Any]]]]:
    """Per-window peaks of pods stuck in each tracked waiting reason."""
    trends: dict[str, dict[str, list[dict[str, Any]]]] = {}
    for key, reason in _POD_WAITING_REASONS.items():
        base = f'kube_pod_container_status_waiting_reason{{reason="{reason}"}}'
        per_window: dict[str, list[dict[str, Any]]] = {}
        for window in _TREND_WINDOWS:
            per_window[window] = _pod_reason_entries(f"max_over_time(({base})[{window}])", _POD_REASON_TREND_LIMIT)
        trends[key] = per_window
    return trends
def _pod_terminated_now() -> dict[str, list[dict[str, Any]]]:
    """Current pods terminated for each tracked termination reason."""
    return {
        key: _pod_reason_entries(f'kube_pod_container_status_terminated_reason{{reason="{reason}"}}', _POD_REASON_LIMIT)
        for key, reason in _POD_TERMINATED_REASONS.items()
    }
def _pod_terminated_trends() -> dict[str, dict[str, list[dict[str, Any]]]]:
    """Per-window peaks of pod terminations for each tracked reason."""
    trends: dict[str, dict[str, list[dict[str, Any]]]] = {}
    for key, reason in _POD_TERMINATED_REASONS.items():
        base = f'kube_pod_container_status_terminated_reason{{reason="{reason}"}}'
        per_window: dict[str, list[dict[str, Any]]] = {}
        for window in _TREND_WINDOWS:
            per_window[window] = _pod_reason_entries(f"max_over_time(({base})[{window}])", _POD_REASON_TREND_LIMIT)
        trends[key] = per_window
    return trends
def _pods_phase_trends() -> dict[str, dict[str, dict[str, float | None]]]:
    """Per-window avg/max of running/pending/failed pod counts."""
    phase_exprs = {
        "running": "sum(kube_pod_status_phase{phase=\"Running\"})",
        "pending": "sum(kube_pod_status_phase{phase=\"Pending\"})",
        "failed": "sum(kube_pod_status_phase{phase=\"Failed\"})",
    }
    return {
        window: {
            phase: {
                "avg": _vm_scalar_window(expr, window, "avg_over_time"),
                "max": _vm_scalar_window(expr, window, "max_over_time"),
            }
            for phase, expr in phase_exprs.items()
        }
        for window in _TREND_WINDOWS
    }
def _pvc_usage_trends() -> dict[str, list[dict[str, Any]]]:
    """Return top PVC fill percentages (max over each trend window)."""
    ratio_expr = "kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100"
    result: dict[str, list[dict[str, Any]]] = {}
    for window in _TREND_WINDOWS:
        query = f"topk({_TREND_PVC_LIMIT}, max_over_time(({ratio_expr})[{window}]))"
        result[window] = _pvc_top(_vm_vector(query))
    return result
# Re-export module-private helpers (plus the shared contract types) so sibling
# cluster-state modules can star-import this one.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,330 @@
from __future__ import annotations
from typing import Any
from .cluster_state_contract import *
from .cluster_state_vm_client import *
from .cluster_state_vm_trends import *
def _postgres_connections(errors: list[str]) -> dict[str, Any]:
    """Collect Postgres connection metrics; on failure append to *errors*.

    Returns whatever subset of {used, max, by_db, hottest_db} was gathered
    before an error (if any) occurred.
    """
    stats: dict[str, Any] = {}
    try:
        stats["used"] = _vm_scalar("sum(pg_stat_activity_count)")
        stats["max"] = _vm_scalar("max(pg_settings_max_connections)")
        by_db_query = "topk(5, sum by (datname) (pg_stat_activity_count))"
        stats["by_db"] = _vm_vector(by_db_query)
        stats["hottest_db"] = _vm_topk(
            "topk(1, sum by (datname) (pg_stat_activity_count))",
            "datname",
        )
    except Exception as exc:
        errors.append(f"postgres: {exc}")
    return stats
def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
    """Identify the single busiest node for cpu/ram/net/io.

    Each query joins instance-level node_exporter series onto the node name via
    ``node_uname_info`` (label_replace on ``nodename``), takes topk(1), and is
    read back through ``_vm_topk`` keyed on the ``node`` label. Failures are
    appended to *errors* and a partial dict is returned.
    """
    hottest: dict[str, Any] = {}
    try:
        # Busiest CPU: 100% minus idle rate, averaged per node.
        hottest["cpu"] = _vm_topk(
            f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
            f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
            "node",
        )
        # Busiest RAM: (total - available) / total as a percentage.
        hottest["ram"] = _vm_topk(
            f'label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) '
            f'/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
            "node",
        )
        # Busiest network: receive + transmit bytes/s, loopback excluded.
        hottest["net"] = _vm_topk(
            f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
            f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
            "node",
        )
        # Busiest disk I/O: read + written bytes/s.
        hottest["io"] = _vm_topk(
            f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
            f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
            "node",
        )
    except Exception as exc:
        errors.append(f"hottest: {exc}")
    return hottest
def _node_usage(errors: list[str]) -> dict[str, Any]:
    """Fetch per-node cpu/ram/net/io/disk usage vectors.

    Query expressions come from ``_node_usage_exprs``; failures append to
    *errors* and whatever was collected so far is returned.
    """
    metrics: dict[str, Any] = {}
    try:
        exprs = _node_usage_exprs()
        for key in ("cpu", "ram", "net", "io", "disk"):
            metrics[key] = _vm_node_metric(exprs[key], "node")
    except Exception as exc:
        errors.append(f"node_usage: {exc}")
    return metrics
def _pvc_usage(errors: list[str]) -> list[dict[str, Any]]:
    """Return the five fullest PVCs (percent used), namespace-filtered.

    On query failure the error is appended to *errors* and [] is returned.
    """
    query = (
        "topk(5, max by (namespace,persistentvolumeclaim) "
        "(kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))"
    )
    try:
        return _filter_namespace_vector(_vm_vector(query))
    except Exception as exc:
        errors.append(f"pvc_usage: {exc}")
        return []
def _usage_stats(series: list[dict[str, Any]]) -> dict[str, float]:
values: list[float] = []
for entry in series:
if not isinstance(entry, dict):
continue
try:
values.append(float(entry.get("value")))
except (TypeError, ValueError):
continue
if not values:
return {}
return {
"min": min(values),
"max": max(values),
"avg": sum(values) / len(values),
}
def _vm_namespace_totals(expr: str) -> dict[str, float]:
    """Evaluate *expr* and map each sample's ``namespace`` label to its value.

    Samples without a non-empty string namespace or with a non-numeric value
    are skipped.
    """
    totals: dict[str, float] = {}
    for sample in _vm_vector(expr):
        labels = sample.get("metric")
        if not isinstance(labels, dict):
            labels = {}
        ns = labels.get("namespace")
        if not (isinstance(ns, str) and ns):
            continue
        try:
            totals[ns] = float(sample.get("value"))
        except (TypeError, ValueError):
            continue
    return totals
def _build_namespace_capacity(
cpu_usage: dict[str, float],
cpu_requests: dict[str, float],
mem_usage: dict[str, float],
mem_requests: dict[str, float],
) -> list[dict[str, Any]]:
namespaces = sorted(set(cpu_usage) | set(cpu_requests) | set(mem_usage) | set(mem_requests))
output: list[dict[str, Any]] = []
for namespace in namespaces:
cpu_used = cpu_usage.get(namespace)
cpu_req = cpu_requests.get(namespace)
mem_used = mem_usage.get(namespace)
mem_req = mem_requests.get(namespace)
cpu_ratio = None
mem_ratio = None
if isinstance(cpu_used, (int, float)) and isinstance(cpu_req, (int, float)) and cpu_req > 0:
cpu_ratio = cpu_used / cpu_req
if isinstance(mem_used, (int, float)) and isinstance(mem_req, (int, float)) and mem_req > 0:
mem_ratio = mem_used / mem_req
output.append(
{
"namespace": namespace,
"cpu_usage": cpu_used,
"cpu_requests": cpu_req,
"cpu_usage_ratio": cpu_ratio,
"mem_usage": mem_used,
"mem_requests": mem_req,
"mem_usage_ratio": mem_ratio,
}
)
output.sort(
key=lambda item: (
-(item.get("cpu_requests") or 0),
-(item.get("mem_requests") or 0),
item.get("namespace") or "",
)
)
return output
def _node_usage_profile(
    node_usage: dict[str, list[dict[str, Any]]],
    node_details: list[dict[str, Any]],
    node_pods: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Merge per-node metric vectors with node details into ranked profiles.

    Each row carries the raw cpu/ram/disk/net/io readings, the same values
    normalized against the cluster-wide maximum per metric (``*_norm``),
    pressure/taint/schedulability flags from *node_details*, the pod count
    from *node_pods*, and a ``load_index`` equal to the mean of the available
    normalized metrics. Rows are sorted by descending load index, then name.
    """
    # Pivot metric-keyed vectors into {node: {metric: value}}.
    usage: dict[str, dict[str, Any]] = {}
    for key in ("cpu", "ram", "disk", "net", "io"):
        for item in node_usage.get(key, []) or []:
            node = item.get("node")
            value = item.get("value")
            if not isinstance(node, str) or not node:
                continue
            if not isinstance(value, (int, float)):
                continue
            usage.setdefault(node, {})[key] = float(value)
    # Cluster-wide maximum per metric, used as the normalization denominator.
    max_values: dict[str, float] = {}
    for key in ("cpu", "ram", "disk", "net", "io"):
        values = [entry.get(key) for entry in usage.values() if isinstance(entry.get(key), (int, float))]
        max_values[key] = max(values) if values else 0.0
    # Index detail/pod records by node name for O(1) joins below.
    detail_map: dict[str, dict[str, Any]] = {
        entry.get("name"): entry for entry in node_details if isinstance(entry, dict)
    }
    pod_map: dict[str, dict[str, Any]] = {
        entry.get("node"): entry for entry in node_pods if isinstance(entry, dict)
    }
    output: list[dict[str, Any]] = []
    for node, entry in usage.items():
        detail = detail_map.get(node, {})
        pressure = detail.get("pressure") if isinstance(detail.get("pressure"), dict) else {}
        pressure_count = sum(1 for value in pressure.values() if value)
        taints = detail.get("taints") if isinstance(detail.get("taints"), list) else []
        unschedulable = bool(detail.get("unschedulable"))
        pods_total = None
        pod_entry = pod_map.get(node)
        if isinstance(pod_entry, dict):
            pods_total = pod_entry.get("pods_total")
        # Normalize each metric to [0, 1]; metrics with no cluster max are omitted.
        normalized: dict[str, float] = {}
        for key in ("cpu", "ram", "disk", "net", "io"):
            raw = entry.get(key)
            max_val = max_values.get(key) or 0.0
            if isinstance(raw, (int, float)) and max_val > 0:
                normalized[f"{key}_norm"] = raw / max_val
        norm_values = [v for v in normalized.values() if isinstance(v, (int, float))]
        load_index = sum(norm_values) / len(norm_values) if norm_values else None
        output.append(
            {
                "node": node,
                "cpu": entry.get("cpu"),
                "ram": entry.get("ram"),
                "disk": entry.get("disk"),
                "net": entry.get("net"),
                "io": entry.get("io"),
                **normalized,
                "pressure_flags": pressure,
                "pressure_count": pressure_count,
                "taints": taints,
                "unschedulable": unschedulable,
                "pods_total": pods_total,
                "load_index": load_index,
            }
        )
    output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("node") or ""))
    return output
def _percentile(values: list[float], percentile: float) -> float | None:
if not values:
return None
ordered = sorted(values)
idx = int(round((len(ordered) - 1) * percentile))
idx = min(max(idx, 0), len(ordered) - 1)
return ordered[idx]
def _node_load_summary(node_load: list[dict[str, Any]]) -> dict[str, Any]:
    """Summarize the ``load_index`` distribution of node profiles.

    Returns avg/p90/min/max plus the top and bottom ``_LOAD_TOP_COUNT``
    nodes and outliers at least one standard deviation above the mean;
    an empty dict when no entry carries a numeric load index.
    """
    items = [
        entry
        for entry in node_load
        if isinstance(entry, dict) and isinstance(entry.get("load_index"), (int, float))
    ]
    if not items:
        return {}
    values = [float(entry.get("load_index") or 0) for entry in items]
    avg = sum(values) / len(values)
    # Population variance (divides by N, not N-1).
    variance = sum((value - avg) ** 2 for value in values) / len(values)
    stddev = variance**0.5
    top = sorted(items, key=lambda item: -(item.get("load_index") or 0))[:_LOAD_TOP_COUNT]
    bottom = sorted(items, key=lambda item: (item.get("load_index") or 0))[:_LOAD_TOP_COUNT]
    outliers = [
        item
        for item in items
        if isinstance(item.get("load_index"), (int, float))
        and item.get("load_index") >= avg + stddev
    ]
    outliers.sort(key=lambda item: -(item.get("load_index") or 0))
    return {
        "avg": round(avg, 3),
        "p90": round(_percentile(values, 0.9) or 0.0, 3),
        "min": round(min(values), 3),
        "max": round(max(values), 3),
        "top": top,
        "bottom": bottom,
        "outliers": outliers[:_LOAD_TOP_COUNT],
    }
def _namespace_capacity_summary(capacity: list[dict[str, Any]]) -> dict[str, Any]:
    """Condense namespace capacity rows into over-commit and headroom views.

    Produces the top ``_NAMESPACE_TOP_COUNT`` namespaces by usage/requests
    ratio, the lowest-headroom namespaces (requests - usage), and counts plus
    sorted names of namespaces whose ratio exceeds 1 (over-committed).
    Returns an empty dict for empty input.
    """
    if not capacity:
        return {}
    cpu_ratio = [
        entry
        for entry in capacity
        if isinstance(entry, dict) and isinstance(entry.get("cpu_usage_ratio"), (int, float))
    ]
    mem_ratio = [
        entry
        for entry in capacity
        if isinstance(entry, dict) and isinstance(entry.get("mem_usage_ratio"), (int, float))
    ]
    cpu_ratio.sort(key=lambda item: -(item.get("cpu_usage_ratio") or 0))
    mem_ratio.sort(key=lambda item: -(item.get("mem_usage_ratio") or 0))
    cpu_headroom: list[dict[str, Any]] = []
    mem_headroom: list[dict[str, Any]] = []
    for entry in capacity:
        if not isinstance(entry, dict):
            continue
        cpu_used = entry.get("cpu_usage")
        cpu_req = entry.get("cpu_requests")
        mem_used = entry.get("mem_usage")
        mem_req = entry.get("mem_requests")
        # Headroom is defined only when both usage and requests are numeric.
        if isinstance(cpu_used, (int, float)) and isinstance(cpu_req, (int, float)):
            cpu_headroom.append(
                {
                    "namespace": entry.get("namespace"),
                    "headroom": cpu_req - cpu_used,
                    "usage": cpu_used,
                    "requests": cpu_req,
                    "ratio": entry.get("cpu_usage_ratio"),
                }
            )
        if isinstance(mem_used, (int, float)) and isinstance(mem_req, (int, float)):
            mem_headroom.append(
                {
                    "namespace": entry.get("namespace"),
                    "headroom": mem_req - mem_used,
                    "usage": mem_used,
                    "requests": mem_req,
                    "ratio": entry.get("mem_usage_ratio"),
                }
            )
    # Ascending headroom: most constrained namespaces first.
    cpu_headroom.sort(key=lambda item: (item.get("headroom") or 0))
    mem_headroom.sort(key=lambda item: (item.get("headroom") or 0))
    cpu_over_names = [
        entry.get("namespace")
        for entry in cpu_ratio
        if (entry.get("cpu_usage_ratio") or 0) > 1 and entry.get("namespace")
    ]
    mem_over_names = [
        entry.get("namespace")
        for entry in mem_ratio
        if (entry.get("mem_usage_ratio") or 0) > 1 and entry.get("namespace")
    ]
    over_cpu = len(cpu_over_names)
    over_mem = len(mem_over_names)
    return {
        "cpu_ratio_top": cpu_ratio[:_NAMESPACE_TOP_COUNT],
        "mem_ratio_top": mem_ratio[:_NAMESPACE_TOP_COUNT],
        "cpu_headroom_low": cpu_headroom[:_NAMESPACE_TOP_COUNT],
        "mem_headroom_low": mem_headroom[:_NAMESPACE_TOP_COUNT],
        "cpu_overcommitted": over_cpu,
        "mem_overcommitted": over_mem,
        "cpu_overcommitted_names": sorted({name for name in cpu_over_names if isinstance(name, str)}),
        "mem_overcommitted_names": sorted({name for name in mem_over_names if isinstance(name, str)}),
    }
# Re-export module-private helpers (plus the shared contract types) so sibling
# cluster-state modules can star-import this one.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -0,0 +1,249 @@
from __future__ import annotations
from typing import Any
from .cluster_state_contract import *
from .cluster_state_nodes import *
def _summarize_jobs(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize a Kubernetes Job list payload.

    Returns cluster totals, a per-namespace breakdown (top 20 by activity),
    the top 20 failing jobs, and the 20 oldest still-active jobs. Jobs
    without both a name and a namespace are skipped entirely.
    """
    totals = {"total": 0, "active": 0, "failed": 0, "succeeded": 0}
    by_namespace: dict[str, dict[str, int]] = {}
    failing: list[dict[str, Any]] = []
    active_oldest: list[dict[str, Any]] = []
    for job in _items(payload):
        metadata = job.get("metadata") if isinstance(job.get("metadata"), dict) else {}
        status = job.get("status") if isinstance(job.get("status"), dict) else {}
        name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
        namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
        created_at = (
            metadata.get("creationTimestamp")
            if isinstance(metadata.get("creationTimestamp"), str)
            else ""
        )
        if not name or not namespace:
            continue
        active = int(status.get("active") or 0)
        failed = int(status.get("failed") or 0)
        succeeded = int(status.get("succeeded") or 0)
        totals["total"] += 1
        totals["active"] += active
        totals["failed"] += failed
        totals["succeeded"] += succeeded
        entry = by_namespace.setdefault(namespace, {"active": 0, "failed": 0, "succeeded": 0})
        entry["active"] += active
        entry["failed"] += failed
        entry["succeeded"] += succeeded
        age_hours = _age_hours(created_at)
        if failed > 0:
            failing.append(
                {
                    "namespace": namespace,
                    "job": name,
                    "failed": failed,
                    "age_hours": age_hours,
                }
            )
        # Active jobs need a parseable creation time to be ranked by age.
        if active > 0 and age_hours is not None:
            active_oldest.append(
                {
                    "namespace": namespace,
                    "job": name,
                    "active": active,
                    "age_hours": age_hours,
                }
            )
    failing.sort(
        key=lambda item: (
            -(item.get("failed") or 0),
            -(item.get("age_hours") or 0.0),
            item.get("namespace") or "",
            item.get("job") or "",
        )
    )
    active_oldest.sort(key=lambda item: -(item.get("age_hours") or 0.0))
    namespace_summary = [
        {
            "namespace": ns,
            "active": stats.get("active", 0),
            "failed": stats.get("failed", 0),
            "succeeded": stats.get("succeeded", 0),
        }
        for ns, stats in by_namespace.items()
    ]
    namespace_summary.sort(
        key=lambda item: (
            -(item.get("active") or 0),
            -(item.get("failed") or 0),
            item.get("namespace") or "",
        )
    )
    return {
        "totals": totals,
        "by_namespace": namespace_summary[:20],
        "failing": failing[:20],
        "active_oldest": active_oldest[:20],
    }
def _summarize_deployments(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize Deployment readiness from a Kubernetes list payload.

    Deployments scaled to zero desired replicas are ignored; the rest are
    flagged when ready or available replicas fall short of desired.
    """
    deployments = _items(payload)
    not_ready: list[dict[str, Any]] = []
    for item in deployments:
        meta = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
        spec = item.get("spec") if isinstance(item.get("spec"), dict) else {}
        status = item.get("status") if isinstance(item.get("status"), dict) else {}
        desired = int(spec.get("replicas") or 0)
        if desired <= 0:
            # Scaled-to-zero deployments are not considered unhealthy.
            continue
        ready = int(status.get("readyReplicas") or 0)
        available = int(status.get("availableReplicas") or 0)
        if ready < desired or available < desired:
            not_ready.append(
                {
                    "name": meta.get("name") if isinstance(meta.get("name"), str) else "",
                    "namespace": meta.get("namespace") if isinstance(meta.get("namespace"), str) else "",
                    "desired": desired,
                    "ready": ready,
                    "available": available,
                    "updated": int(status.get("updatedReplicas") or 0),
                }
            )
    not_ready.sort(key=lambda row: (row.get("namespace") or "", row.get("name") or ""))
    return {
        "total": len(deployments),
        "not_ready": len(not_ready),
        "items": not_ready,
    }
def _summarize_statefulsets(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize StatefulSet readiness from a Kubernetes list payload.

    StatefulSets scaled to zero are ignored; the rest are flagged when
    ready replicas fall short of desired.
    """
    statefulsets = _items(payload)
    not_ready: list[dict[str, Any]] = []
    for item in statefulsets:
        meta = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
        spec = item.get("spec") if isinstance(item.get("spec"), dict) else {}
        status = item.get("status") if isinstance(item.get("status"), dict) else {}
        desired = int(spec.get("replicas") or 0)
        if desired <= 0:
            # Scaled-to-zero statefulsets are not considered unhealthy.
            continue
        ready = int(status.get("readyReplicas") or 0)
        if ready < desired:
            not_ready.append(
                {
                    "name": meta.get("name") if isinstance(meta.get("name"), str) else "",
                    "namespace": meta.get("namespace") if isinstance(meta.get("namespace"), str) else "",
                    "desired": desired,
                    "ready": ready,
                    "current": int(status.get("currentReplicas") or 0),
                    "updated": int(status.get("updatedReplicas") or 0),
                }
            )
    not_ready.sort(key=lambda row: (row.get("namespace") or "", row.get("name") or ""))
    return {
        "total": len(statefulsets),
        "not_ready": len(not_ready),
        "items": not_ready,
    }
def _summarize_daemonsets(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize DaemonSet readiness from a Kubernetes list payload.

    DaemonSets with no scheduled pods desired are ignored; the rest are
    flagged when ready pods fall short of desired.
    """
    daemonsets = _items(payload)
    not_ready: list[dict[str, Any]] = []
    for item in daemonsets:
        meta = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
        status = item.get("status") if isinstance(item.get("status"), dict) else {}
        desired = int(status.get("desiredNumberScheduled") or 0)
        if desired <= 0:
            # Nothing scheduled means nothing to be unhealthy about.
            continue
        ready = int(status.get("numberReady") or 0)
        if ready < desired:
            not_ready.append(
                {
                    "name": meta.get("name") if isinstance(meta.get("name"), str) else "",
                    "namespace": meta.get("namespace") if isinstance(meta.get("namespace"), str) else "",
                    "desired": desired,
                    "ready": ready,
                    "updated": int(status.get("updatedNumberScheduled") or 0),
                }
            )
    not_ready.sort(key=lambda row: (row.get("namespace") or "", row.get("name") or ""))
    return {
        "total": len(daemonsets),
        "not_ready": len(not_ready),
        "items": not_ready,
    }
def _summarize_workload_health(
deployments: dict[str, Any],
statefulsets: dict[str, Any],
daemonsets: dict[str, Any],
) -> dict[str, Any]:
return {
"deployments": deployments,
"statefulsets": statefulsets,
"daemonsets": daemonsets,
}
def _summarize_longhorn_volumes(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize Longhorn volume custom resources.

    Returns counts by state and robustness, attached/detached totals, and
    a name-sorted list of volumes whose robustness is degraded or faulted.
    An empty payload yields an empty dict.
    """
    items = _items(payload)
    if not items:
        return {}
    by_state: dict[str, int] = {}
    by_robustness: dict[str, int] = {}
    degraded: list[dict[str, Any]] = []
    attached_count = 0
    detached_count = 0
    degraded_count = 0
    for volume in items:
        metadata = volume.get("metadata") if isinstance(volume.get("metadata"), dict) else {}
        status = volume.get("status") if isinstance(volume.get("status"), dict) else {}
        spec = volume.get("spec") if isinstance(volume.get("spec"), dict) else {}
        name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
        if not name:
            continue
        state = status.get("state") if isinstance(status.get("state"), str) else "unknown"
        robustness = (
            status.get("robustness") if isinstance(status.get("robustness"), str) else "unknown"
        )
        # Counters keep the original casing; comparisons are case-insensitive.
        state_lower = state.lower()
        robustness_lower = robustness.lower()
        by_state[state] = by_state.get(state, 0) + 1
        by_robustness[robustness] = by_robustness.get(robustness, 0) + 1
        if state_lower == "attached":
            attached_count += 1
        elif state_lower == "detached":
            detached_count += 1
        if robustness_lower in {"degraded", "faulted"}:
            degraded_count += 1
            degraded.append(
                {
                    "name": name,
                    "state": state,
                    "robustness": robustness,
                    "size": spec.get("size"),
                    "actual_size": status.get("actualSize"),
                }
            )
    degraded.sort(key=lambda item: item.get("name") or "")
    return {
        "total": len(items),
        "by_state": by_state,
        "by_robustness": by_robustness,
        "attached_count": attached_count,
        "detached_count": detached_count,
        "degraded": degraded,
        "degraded_count": degraded_count,
    }
# Re-export module-private helpers (plus the shared contract types) so sibling
# cluster-state modules can star-import this one.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]

View File

@ -1,93 +1,26 @@
from __future__ import annotations
from dataclasses import dataclass
import base64
import time
import urllib.parse
from typing import Any
import httpx
import psycopg
from ..settings import settings
from ..utils.logging import get_logger
from ..utils.name_generator import NameGenerator
from .comms_guest_names import _CommsGuestNameMixin
from .comms_protocol import _canon_user
from .comms_room_ops import _CommsRoomOpsMixin
# Module-level structured logger for comms maintenance events.
logger = get_logger(__name__)
class CommsService(_CommsGuestNameMixin, _CommsRoomOpsMixin):
"""Maintain Matrix/MAS guest naming and room hygiene.
HTTP_OK = 200
HTTP_CREATED = 201
HTTP_ACCEPTED = 202
HTTP_NO_CONTENT = 204
HTTP_BAD_REQUEST = 400
HTTP_NOT_FOUND = 404
HTTP_CONFLICT = 409
Inputs: Matrix/MAS endpoints, service credentials, and optional database access
from settings. Outputs: scheduled maintenance actions plus small status dicts
for scheduler logging.
"""
@dataclass(frozen=True)
class CommsSummary:
    """Outcome counters for one comms maintenance pass."""

    processed: int
    renamed: int
    pruned: int
    skipped: int
    detail: str = ""  # optional human-readable note
@dataclass(frozen=True)
class MasGuestResult:
    """Result of the MAS guest-rename stage, including seen usernames."""

    renamed: int
    skipped: int
    usernames: set[str]  # all MAS usernames encountered during the pass
@dataclass(frozen=True)
class SynapseGuestResult:
    """Result of the Synapse guest rename/prune stage."""

    renamed: int
    pruned: int
@dataclass(frozen=True)
class DisplayNameTarget:
    """A display-name assignment for a user, optionally in a specific room."""

    room_id: str
    user_id: str
    name: str
    in_room: bool  # whether the user's room member state should also be updated
@dataclass(frozen=True)
class SynapseUserRef:
    """A Synapse admin-API user entry with its parsed id and localpart."""

    entry: dict[str, Any]
    user_id: str
    localpart: str  # user id without the leading '@' and server suffix
def _auth(token: str) -> dict[str, str]:
return {"Authorization": f"Bearer {token}"}
def _canon_user(user: str, server_name: str) -> str:
user = (user or "").strip()
if user.startswith("@") and ":" in user:
return user
user = user.lstrip("@")
if ":" in user:
return f"@{user}"
return f"@{user}:{server_name}"
def _needs_rename_username(username: str) -> bool:
return username.isdigit() or username.startswith("guest-")
def _needs_rename_display(display: str | None) -> bool:
if not display:
return True
return display.isdigit() or display.startswith("guest-")
class CommsService:
def __init__(
self,
client_factory: type[httpx.Client] = httpx.Client,
@ -96,6 +29,10 @@ class CommsService:
self._client_factory = client_factory
self._name_generator = name_generator or NameGenerator()
@property
def _settings(self) -> Any:
return settings
def _pick_guest_name(self, existing: set[str]) -> str | None:
return self._name_generator.unique(existing)
@ -106,838 +43,22 @@ class CommsService:
token = getattr(settings, "comms_synapse_admin_token", "")
return token if token else fallback
def _mas_admin_token(self, client: httpx.Client) -> str:
    """Obtain an admin-scoped MAS access token via the client-credentials grant.

    Retries up to five times with exponential backoff (1, 2, 4, 8 s between
    attempts). Raises RuntimeError when credentials are missing, when the
    response lacks a usable access token, or when every attempt fails
    (carrying the last error's message).
    """
    if not settings.comms_mas_admin_client_id or not settings.comms_mas_admin_client_secret:
        raise RuntimeError("mas admin client credentials missing")
    basic = base64.b64encode(
        f"{settings.comms_mas_admin_client_id}:{settings.comms_mas_admin_client_secret}".encode()
    ).decode()
    last_err: Exception | None = None
    attempts = 5
    for attempt in range(attempts):
        try:
            resp = client.post(
                settings.comms_mas_token_url,
                headers={"Authorization": f"Basic {basic}"},
                data={"grant_type": "client_credentials", "scope": "urn:mas:admin"},
            )
            resp.raise_for_status()
            payload = resp.json()
            token = payload.get("access_token")
            if not isinstance(token, str) or not token:
                raise RuntimeError("missing mas access token")
            return token
        except Exception as exc:  # noqa: BLE001
            last_err = exc
            # Back off only *between* attempts; the original slept 2**4 = 16 s
            # after the final failure before raising, delaying error reporting
            # for no benefit.
            if attempt < attempts - 1:
                time.sleep(2**attempt)
    raise RuntimeError(str(last_err) if last_err else "mas admin token failed")
def _mas_user_id(self, client: httpx.Client, token: str, username: str) -> str:
    """Resolve a MAS username to its internal user id via the admin API.

    Raises on HTTP errors; KeyError if the response lacks ``data.id``.
    """
    url = f"{settings.comms_mas_admin_api_base}/users/by-username/{urllib.parse.quote(username)}"
    resp = client.get(url, headers=_auth(token))
    resp.raise_for_status()
    payload = resp.json()
    return payload["data"]["id"]
def _mas_personal_session(self, client: httpx.Client, token: str, user_id: str) -> tuple[str, str]:
    """Create a short-lived (300 s) MAS personal session acting as *user_id*.

    Returns ``(access_token, session_id)``; raises RuntimeError when either
    field is missing from the response.
    """
    resp = client.post(
        f"{settings.comms_mas_admin_api_base}/personal-sessions",
        headers=_auth(token),
        json={
            "actor_user_id": user_id,
            "human_name": "guest-name-randomizer",
            "scope": "urn:matrix:client:api:*",
            "expires_in": 300,
        },
    )
    resp.raise_for_status()
    payload = resp.json().get("data", {})
    session_id = payload.get("id")
    # Guard against a non-dict payload before reading nested attributes.
    attrs = (payload.get("attributes") or {}) if isinstance(payload, dict) else {}
    access_token = attrs.get("access_token")
    if not isinstance(access_token, str) or not isinstance(session_id, str):
        raise RuntimeError("invalid personal session response")
    return access_token, session_id
def _mas_revoke_session(self, client: httpx.Client, token: str, session_id: str) -> None:
    """Best-effort revocation of a MAS personal session; errors are swallowed."""
    try:
        client.post(
            f"{settings.comms_mas_admin_api_base}/personal-sessions/{urllib.parse.quote(session_id)}/revoke",
            headers=_auth(token),
            json={},
        )
    except Exception:
        # Revocation is cleanup only; the session expires on its own (300 s).
        return
def _resolve_alias(self, client: httpx.Client, token: str, alias: str) -> str:
    """Resolve a Matrix room alias to its room id via the client directory API."""
    resp = client.get(
        f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
        headers=_auth(token),
    )
    resp.raise_for_status()
    payload = resp.json()
    return payload["room_id"]
def _room_members(self, client: httpx.Client, token: str, room_id: str) -> tuple[set[str], set[str]]:
    """Fetch a room's membership state.

    Returns ``(member_user_ids, display_names_in_use)``; the second set lets
    callers avoid assigning a generated name that is already taken.
    """
    resp = client.get(
        f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members",
        headers=_auth(token),
    )
    resp.raise_for_status()
    payload = resp.json()
    members: set[str] = set()
    existing: set[str] = set()
    for ev in payload.get("chunk", []) or []:
        # Each chunk entry is an m.room.member state event.
        user_id = ev.get("state_key")
        if isinstance(user_id, str) and user_id:
            members.add(user_id)
        display = (ev.get("content") or {}).get("displayname")
        if isinstance(display, str) and display:
            existing.add(display)
    return members, existing
def _mas_list_users(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
    """Page through all MAS users (100 per page) following the page cursor.

    Stops when a page is empty or the last entry carries no continuation
    cursor; non-dict entries are dropped.
    """
    users: list[dict[str, Any]] = []
    cursor = None
    while True:
        url = f"{settings.comms_mas_admin_api_base}/users?page[size]=100"
        if cursor:
            url += f"&page[after]={urllib.parse.quote(cursor)}"
        resp = client.get(url, headers=_auth(token))
        resp.raise_for_status()
        payload = resp.json()
        data = payload.get("data") or []
        if not isinstance(data, list) or not data:
            break
        users.extend([item for item in data if isinstance(item, dict)])
        # The next-page cursor lives on the last item's meta.page.cursor.
        last = data[-1]
        cursor = (
            last.get("meta", {})
            if isinstance(last, dict)
            else {}
        ).get("page", {}).get("cursor")
        if not cursor:
            break
    return users
def _synapse_list_users(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
    """List all local, non-deactivated Synapse users via the admin API.

    Pages 100 at a time using the admin API's ``next_token`` continuation.
    """
    users: list[dict[str, Any]] = []
    from_token = None
    admin_token = self._admin_token(token)
    while True:
        url = "{}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100".format(
            settings.comms_synapse_base
        )
        if from_token:
            url += f"&from={urllib.parse.quote(from_token)}"
        resp = client.get(url, headers=_auth(admin_token))
        resp.raise_for_status()
        payload = resp.json()
        users.extend([item for item in payload.get("users", []) if isinstance(item, dict)])
        from_token = payload.get("next_token")
        if not from_token:
            break
    return users
def _should_prune_guest(self, entry: dict[str, Any], now_ms: int) -> bool:
    """Decide whether a Synapse account is a guest stale enough to prune.

    Requires the ``is_guest`` flag plus a parseable ``last_seen_ts`` older
    than the configured ``comms_guest_stale_days`` window.
    """
    if not entry.get("is_guest"):
        return False
    raw_last_seen = entry.get("last_seen_ts")
    if raw_last_seen is None:
        return False
    try:
        last_seen_ms = int(raw_last_seen)
    except (TypeError, ValueError):
        return False
    cutoff_ms = int(settings.comms_guest_stale_days) * 24 * 60 * 60 * 1000
    return now_ms - last_seen_ms > cutoff_ms
def _prune_guest(self, client: httpx.Client, token: str, user_id: str) -> bool:
    """Erase a guest account via the Synapse admin API.

    Returns True on success (404 counts: the user is already gone);
    failures are logged and reported as False instead of raising.
    """
    admin_token = self._admin_token(token)
    try:
        resp = client.delete(
            f"{settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
            headers=_auth(admin_token),
            params={"erase": "true"},
        )
    except Exception as exc:  # noqa: BLE001
        logger.info(
            "guest prune failed",
            extra={"event": "comms_guest_prune", "status": "error", "detail": str(exc)},
        )
        return False
    if resp.status_code in (HTTP_OK, HTTP_ACCEPTED, HTTP_NO_CONTENT, HTTP_NOT_FOUND):
        return True
    logger.info(
        "guest prune failed",
        extra={
            "event": "comms_guest_prune",
            "status": "error",
            "detail": f"{resp.status_code} {resp.text}",
        },
    )
    return False
def _get_displayname(self, client: httpx.Client, token: str, user_id: str) -> str | None:
    """Fetch a user's display name via the Matrix client profile API."""
    resp = client.get(
        f"{settings.comms_synapse_base}/_matrix/client/v3/profile/{urllib.parse.quote(user_id)}",
        headers=_auth(token),
    )
    resp.raise_for_status()
    return resp.json().get("displayname")
def _get_displayname_admin(self, client: httpx.Client, token: str, user_id: str) -> str | None:
    """Fetch a user's display name via the Synapse admin API.

    Returns None when the user does not exist (404); raises on other errors.
    """
    admin_token = self._admin_token(token)
    resp = client.get(
        f"{settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
        headers=_auth(admin_token),
    )
    if resp.status_code == HTTP_NOT_FOUND:
        return None
    resp.raise_for_status()
    return resp.json().get("displayname")
def _set_displayname(
    self,
    client: httpx.Client,
    token: str,
    target: DisplayNameTarget,
) -> None:
    """Set a user's profile display name, and room member state if in-room.

    The profile PUT raises on failure; the member-state PUT is best-effort
    (its status is not checked).
    """
    resp = client.put(
        f"{settings.comms_synapse_base}/_matrix/client/v3/profile/{urllib.parse.quote(target.user_id)}/displayname",
        headers=_auth(token),
        json={"displayname": target.name},
    )
    resp.raise_for_status()
    if not target.in_room:
        return
    # Also refresh the m.room.member state so the new name shows in the room.
    state_url = (
        f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(target.room_id)}"
        f"/state/m.room.member/{urllib.parse.quote(target.user_id)}"
    )
    client.put(
        state_url,
        headers=_auth(token),
        json={"membership": "join", "displayname": target.name},
    )
def _set_displayname_admin(self, client: httpx.Client, token: str, user_id: str, name: str) -> bool:
    """Set a user's display name via the Synapse admin API.

    Returns True on a 200/201/204 response; never raises for HTTP errors.
    """
    admin_token = self._admin_token(token)
    resp = client.put(
        f"{settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
        headers=_auth(admin_token),
        json={"displayname": name},
    )
    return resp.status_code in (HTTP_OK, HTTP_CREATED, HTTP_NO_CONTENT)
def _db_rename_numeric(self, existing: set[str]) -> int:
if not settings.comms_synapse_db_password:
return 0
renamed = 0
conn = psycopg.connect(
def _connect_synapse_db(self) -> Any:
return psycopg.connect(
host=settings.comms_synapse_db_host,
port=settings.comms_synapse_db_port,
dbname=settings.comms_synapse_db_name,
user=settings.comms_synapse_db_user,
password=settings.comms_synapse_db_password,
)
try:
with conn:
with conn.cursor() as cur:
pattern = f"^@\\d+:{settings.comms_server_name}$"
cur.execute(
"SELECT user_id, full_user_id, displayname FROM profiles WHERE full_user_id ~ %s",
(pattern,),
)
profile_rows = cur.fetchall()
profile_index = {row[1]: row for row in profile_rows}
for _user_id, full_user_id, display in profile_rows:
if display and not _needs_rename_display(display):
continue
new_name = self._pick_guest_name(existing)
if not new_name:
continue
cur.execute(
"UPDATE profiles SET displayname = %s WHERE full_user_id = %s",
(new_name, full_user_id),
)
renamed += 1
cur.execute(
"SELECT name FROM users WHERE name ~ %s",
(pattern,),
)
users = [row[0] for row in cur.fetchall()]
if not users:
return renamed
cur.execute(
"SELECT user_id, full_user_id FROM profiles WHERE full_user_id = ANY(%s)",
(users,),
)
for existing_full in cur.fetchall():
profile_index.setdefault(existing_full[1], existing_full)
def _sleep(self, seconds: float) -> None:
time.sleep(seconds)
for full_user_id in users:
if full_user_id in profile_index:
continue
localpart = full_user_id.split(":", 1)[0].lstrip("@")
new_name = self._pick_guest_name(existing)
if not new_name:
continue
cur.execute(
"INSERT INTO profiles (user_id, displayname, full_user_id) VALUES (%s, %s, %s) "
"ON CONFLICT (full_user_id) DO UPDATE SET displayname = EXCLUDED.displayname",
(localpart, new_name, full_user_id),
)
renamed += 1
finally:
conn.close()
return renamed
def _validate_guest_name_settings(self) -> None:
    """Raise RuntimeError when required comms settings are not configured."""
    if not settings.comms_mas_admin_client_id or not settings.comms_mas_admin_client_secret:
        raise RuntimeError("comms mas admin secret missing")
    if not settings.comms_synapse_base:
        raise RuntimeError("comms synapse base missing")
def _room_context(self, client: httpx.Client, token: str) -> tuple[str, set[str], set[str]]:
    """Resolve the configured room alias and fetch its membership context.

    Returns ``(room_id, member_user_ids, display_names_in_use)``.
    """
    room_id = self._resolve_alias(client, token, settings.comms_room_alias)
    members, existing = self._room_members(client, token, room_id)
    return room_id, members, existing
def _rename_mas_guests(
    self,
    client: httpx.Client,
    admin_token: str,
    room_id: str,
    members: set[str],
    existing: set[str],
) -> MasGuestResult:
    """Assign generated display names to MAS guest-looking users.

    For each MAS user whose username is numeric, ``guest-`` prefixed, or
    flagged ``legacy_guest``, a short-lived personal session is created to
    read and (if needed) replace the display name. All encountered MAS
    usernames are returned so the Synapse pass can skip them.
    """
    renamed = 0
    skipped = 0
    mas_usernames: set[str] = set()
    users = self._mas_list_users(client, admin_token)
    for user in users:
        attrs = user.get("attributes") or {}
        username = attrs.get("username") or ""
        # Record every username, even for users we end up skipping.
        if isinstance(username, str) and username:
            mas_usernames.add(username)
        legacy_guest = attrs.get("legacy_guest")
        if not isinstance(username, str) or not username:
            skipped += 1
            continue
        if not (legacy_guest or _needs_rename_username(username)):
            skipped += 1
            continue
        user_id = user.get("id")
        if not isinstance(user_id, str) or not user_id:
            skipped += 1
            continue
        full_user = f"@{username}:{settings.comms_server_name}"
        access_token, session_id = self._mas_personal_session(client, admin_token, user_id)
        try:
            display = self._get_displayname(client, access_token, full_user)
            if display and not _needs_rename_display(display):
                skipped += 1
                continue
            new_name = self._pick_guest_name(existing)
            if not new_name:
                # Name pool exhausted; leave the user as-is.
                skipped += 1
                continue
            self._set_displayname(
                client,
                access_token,
                DisplayNameTarget(
                    room_id=room_id,
                    user_id=full_user,
                    name=new_name,
                    in_room=full_user in members,
                ),
            )
            renamed += 1
        finally:
            # Always revoke the impersonation session, even on failure.
            self._mas_revoke_session(client, admin_token, session_id)
    return MasGuestResult(renamed=renamed, skipped=skipped, usernames=mas_usernames)
def _synapse_entries(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
    """List local synapse users via the admin API; [] (with an info log) on any failure."""
    try:
        return self._synapse_list_users(client, token)
    except Exception as exc:  # noqa: BLE001
        logger.info(
            "synapse admin list skipped",
            extra={"event": "comms_guest_list", "status": "error", "detail": str(exc)},
        )
        return []
def _synapse_user_id(self, entry: dict[str, Any]) -> SynapseUserRef | None:
    """Build a SynapseUserRef from an admin-API user entry.

    Returns None when the entry's "name" is not a full @user:server id.
    """
    raw = entry.get("name") or ""
    if not (isinstance(raw, str) and raw.startswith("@")):
        return None
    local = raw.split(":", 1)[0].lstrip("@")
    return SynapseUserRef(entry=entry, user_id=raw, localpart=local)
def _maybe_prune_synapse_guest(
    self,
    client: httpx.Client,
    token: str,
    entry: dict[str, Any],
    user_id: str,
    now_ms: int,
) -> bool:
    """Delete a stale guest account; True only when the prune call succeeded."""
    if not entry.get("is_guest"):
        return False
    if not self._should_prune_guest(entry, now_ms):
        return False
    return self._prune_guest(client, token, user_id)
def _needs_synapse_rename(
    self,
    client: httpx.Client,
    token: str,
    user: SynapseUserRef,
    mas_usernames: set[str],
) -> bool:
    """Decide whether a synapse-only user should receive a random display name.

    MAS-managed users are excluded (the MAS pass handles them), as are users
    that are neither guests nor guest-like by localpart, and users whose
    current display name no longer looks guest-like.
    """
    if user.localpart in mas_usernames:
        return False
    is_guest = user.entry.get("is_guest")
    if not (is_guest or _needs_rename_username(user.localpart)):
        return False
    display = self._get_displayname_admin(client, token, user.user_id)
    if display and not _needs_rename_display(display):
        return False
    return True
def _rename_synapse_user(
    self,
    client: httpx.Client,
    token: str,
    existing: set[str],
    user_id: str,
) -> bool:
    """Assign a fresh guest name via the admin API.

    False when no unused name is available or the admin PUT did not succeed.
    """
    new_name = self._pick_guest_name(existing)
    if not new_name:
        return False
    return self._set_displayname_admin(client, token, user_id, new_name)
def _rename_synapse_guests(
    self,
    client: httpx.Client,
    token: str,
    existing: set[str],
    mas_usernames: set[str],
) -> SynapseGuestResult:
    """Prune stale synapse guests and rename remaining guest-like users.

    Fix: the "now" timestamp is taken from the injectable ``self._time()``
    clock seam instead of calling ``time.time()`` directly, matching the
    mixin variant of this method and making the stale-guest cutoff
    testable.  Behavior in production is unchanged (``_time`` returns
    ``time.time()``).
    """
    renamed = 0
    pruned = 0
    entries = self._synapse_entries(client, token)
    now_ms = int(self._time() * 1000)
    for entry in entries:
        user_ref = self._synapse_user_id(entry)
        if not user_ref:
            continue
        # Pruning wins over renaming: a deleted guest needs no new name.
        if self._maybe_prune_synapse_guest(client, token, user_ref.entry, user_ref.user_id, now_ms):
            pruned += 1
            continue
        if not self._needs_synapse_rename(client, token, user_ref, mas_usernames):
            continue
        if self._rename_synapse_user(client, token, existing, user_ref.user_id):
            renamed += 1
    return SynapseGuestResult(renamed=renamed, pruned=pruned)
def run_guest_name_randomizer(self, wait: bool = True) -> dict[str, Any]:
    """Run the full guest-name sync: MAS renames, synapse rename/prune, DB fixup.

    ``wait`` is accepted for signature parity with the other run_* jobs but
    is not referenced in this body.  Returns a status dict containing the
    summary counters.
    """
    self._validate_guest_name_settings()
    with self._client() as client:
        admin_token = self._mas_admin_token(client)
        # Room reads are performed as the seeder via a short-lived MAS session.
        seeder_id = self._mas_user_id(client, admin_token, settings.comms_seeder_user)
        seeder_token, seeder_session = self._mas_personal_session(client, admin_token, seeder_id)
        try:
            room_id, members, existing = self._room_context(client, seeder_token)
            mas_result = self._rename_mas_guests(client, admin_token, room_id, members, existing)
            synapse_result = self._rename_synapse_guests(
                client,
                seeder_token,
                existing,
                mas_result.usernames,
            )
            db_renamed = self._db_rename_numeric(existing)
        finally:
            # Always revoke the seeder's temporary session.
            self._mas_revoke_session(client, admin_token, seeder_session)
    renamed = mas_result.renamed + synapse_result.renamed + db_renamed
    pruned = synapse_result.pruned
    skipped = mas_result.skipped
    processed = renamed + pruned + skipped
    summary = CommsSummary(processed, renamed, pruned, skipped)
    logger.info(
        "comms guest name sync finished",
        extra={
            "event": "comms_guest_name",
            "status": "ok",
            "processed": summary.processed,
            "renamed": summary.renamed,
            "pruned": summary.pruned,
            "skipped": summary.skipped,
        },
    )
    return {"status": "ok", **summary.__dict__}
def run_pin_invite(self, wait: bool = True) -> dict[str, Any]:
    """Ensure the configured pin message exists and is pinned in the room."""
    if not settings.comms_seeder_password:
        raise RuntimeError("comms seeder password missing")
    with self._client() as client:
        token = self._login(client, settings.comms_seeder_user, settings.comms_seeder_password)
        room_id = self._resolve_alias(client, token, settings.comms_room_alias)
        pinned = self._get_pinned(client, token, room_id)
        for event_id in pinned:
            event = self._get_event(client, token, room_id, event_id)
            # An already-pinned event whose body matches exactly means no work.
            if event and (event.get("content") or {}).get("body") == settings.comms_pin_message:
                return {"status": "ok", "detail": "already pinned"}
        event_id = self._send_message(client, token, room_id, settings.comms_pin_message)
        if not event_id:
            return {"status": "error", "detail": "pin event_id missing"}
        self._pin_message(client, token, room_id, event_id)
        return {"status": "ok", "detail": "pinned"}
def run_reset_room(self, wait: bool = True) -> dict[str, Any]:
    """Tombstone the current room and rebuild a fresh public room under the alias.

    Creates the replacement room, applies join/guest/history/power-level
    state, moves the alias, re-invites members (except the seeder and
    numeric localparts), pins the welcome message, then locks down the old
    room and posts a tombstone pointing at the replacement.
    """
    if not settings.comms_seeder_password:
        raise RuntimeError("comms seeder password missing")
    with self._client() as client:
        token = self._login_with_retry(client, settings.comms_seeder_user, settings.comms_seeder_password)
        old_room_id = self._resolve_alias(client, token, settings.comms_room_alias)
        new_room_id = self._create_room(client, token, settings.comms_room_name)
        # Open up the new room before moving the alias over.
        self._set_room_state(client, token, new_room_id, "m.room.join_rules", {"join_rule": "public"})
        self._set_room_state(client, token, new_room_id, "m.room.guest_access", {"guest_access": "can_join"})
        self._set_room_state(
            client,
            token,
            new_room_id,
            "m.room.history_visibility",
            {"history_visibility": "shared"},
        )
        self._set_room_state(client, token, new_room_id, "m.room.power_levels", self._power_levels())
        # Re-point the alias: delete from the old room, attach to the new.
        self._delete_alias(client, token, settings.comms_room_alias)
        self._put_alias(client, token, settings.comms_room_alias, new_room_id)
        self._set_room_state(
            client,
            token,
            new_room_id,
            "m.room.canonical_alias",
            {"alias": settings.comms_room_alias},
        )
        self._set_directory_visibility(client, token, new_room_id, "public")
        bot_user_id = _canon_user(settings.comms_bot_user, settings.comms_server_name)
        self._invite_user(client, token, new_room_id, bot_user_id)
        for uid in self._list_joined_members(client, token, old_room_id):
            # The seeder is already in the new room; numeric localparts are
            # deliberately not re-invited.
            if uid == _canon_user(settings.comms_seeder_user, settings.comms_server_name):
                continue
            localpart = uid.split(":", 1)[0].lstrip("@")
            if localpart.isdigit():
                continue
            self._invite_user(client, token, new_room_id, uid)
        event_id = self._send_message(client, token, new_room_id, settings.comms_pin_message)
        if not event_id:
            raise RuntimeError("pin message event_id missing")
        self._set_room_state(client, token, new_room_id, "m.room.pinned_events", {"pinned": [event_id]})
        # Lock down and tombstone the old room.
        self._set_directory_visibility(client, token, old_room_id, "private")
        self._set_room_state(client, token, old_room_id, "m.room.join_rules", {"join_rule": "invite"})
        self._set_room_state(client, token, old_room_id, "m.room.guest_access", {"guest_access": "forbidden"})
        self._set_room_state(
            client,
            token,
            old_room_id,
            "m.room.tombstone",
            {
                "body": "Othrys has been reset. Please join the new room.",
                "replacement_room": new_room_id,
            },
        )
        self._send_message(
            client,
            token,
            old_room_id,
            "Othrys was reset. Join the new room at https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join",
        )
        return {"status": "ok", "detail": f"old_room_id={old_room_id} new_room_id={new_room_id}"}
def run_seed_room(self, wait: bool = True) -> dict[str, Any]:
    """Ensure the seeder/bot accounts exist and every local user is in the room."""
    if not settings.comms_seeder_password or not settings.comms_bot_password:
        raise RuntimeError("comms seeder/bot password missing")
    with self._client() as client:
        token = self._login(client, settings.comms_seeder_user, settings.comms_seeder_password)
        for user, password, admin in (
            (settings.comms_seeder_user, settings.comms_seeder_password, True),
            (settings.comms_bot_user, settings.comms_bot_password, False),
        ):
            try:
                self._ensure_user(client, token, user, password, admin)
            except RuntimeError as exc:
                message = str(exc)
                # A non-admin token cannot create users; log and keep going.
                if "You are not a server admin" in message:
                    logger.warning(
                        "comms seed room ensure skipped",
                        extra={"event": "comms_seed_room", "user": user, "detail": message},
                    )
                    continue
                raise
        room_id = self._ensure_room(client, token)
        self._join_user(client, token, room_id, _canon_user(settings.comms_bot_user, settings.comms_server_name))
        self._join_all_locals(client, token, room_id)
        return {"status": "ok", "detail": "room seeded"}
def _login(self, client: httpx.Client, user: str, password: str) -> str:
    """Password-login against the auth service; return the access token."""
    body = {
        "type": "m.login.password",
        "identifier": {"type": "m.id.user", "user": _canon_user(user, settings.comms_server_name)},
        "password": password,
    }
    resp = client.post(f"{settings.comms_auth_base}/_matrix/client/v3/login", json=body)
    if resp.status_code != HTTP_OK:
        raise RuntimeError(f"login failed: {resp.status_code} {resp.text}")
    token = resp.json().get("access_token")
    if isinstance(token, str) and token:
        return token
    raise RuntimeError("login missing token")
def _login_with_retry(self, client: httpx.Client, user: str, password: str) -> str:
    """Retry ``_login`` up to five times with linear backoff (2s, 4s, ...).

    Fixes: the original slept for ``attempt * 2`` seconds even after the
    final failed attempt (a wasted 10 s before raising); the sleep now only
    happens between attempts.  The last underlying exception is chained as
    the cause of the raised RuntimeError for easier debugging.
    """
    last: Exception | None = None
    for attempt in range(1, 6):
        try:
            return self._login(client, user, password)
        except Exception as exc:  # noqa: BLE001 - any login failure is retried
            last = exc
            if attempt < 5:
                time.sleep(attempt * 2)
    raise RuntimeError(str(last) if last else "login failed") from last
def _get_pinned(self, client: httpx.Client, token: str, room_id: str) -> list[str]:
    """Return the room's pinned event ids; [] when no pin state exists."""
    url = (
        f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/"
        f"{urllib.parse.quote(room_id)}/state/m.room.pinned_events"
    )
    resp = client.get(url, headers=_auth(token))
    if resp.status_code == HTTP_NOT_FOUND:
        return []
    resp.raise_for_status()
    items = resp.json().get("pinned", [])
    return [event_id for event_id in items if isinstance(event_id, str)]
def _get_event(self, client: httpx.Client, token: str, room_id: str, event_id: str) -> dict[str, Any] | None:
    """Fetch a single room event; None when the server reports 404."""
    url = (
        f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/"
        f"{urllib.parse.quote(room_id)}/event/{urllib.parse.quote(event_id)}"
    )
    resp = client.get(url, headers=_auth(token))
    if resp.status_code == HTTP_NOT_FOUND:
        return None
    resp.raise_for_status()
    return resp.json()
def _send_message(self, client: httpx.Client, token: str, room_id: str, body: str) -> str:
    """Post a plain-text message; return its event_id, or "" when absent."""
    url = (
        f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/"
        f"{urllib.parse.quote(room_id)}/send/m.room.message"
    )
    resp = client.post(url, headers=_auth(token), json={"msgtype": "m.text", "body": body})
    resp.raise_for_status()
    event_id = resp.json().get("event_id")
    if isinstance(event_id, str):
        return event_id
    return ""
def _pin_message(self, client: httpx.Client, token: str, room_id: str, event_id: str) -> None:
    """Replace the room's pinned-events state with just ``event_id``."""
    url = (
        f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/"
        f"{urllib.parse.quote(room_id)}/state/m.room.pinned_events"
    )
    client.put(url, headers=_auth(token), json={"pinned": [event_id]}).raise_for_status()
def _create_room(self, client: httpx.Client, token: str, name: str) -> str:
    """Create a public v11 room with the given name; return its room_id."""
    payload = {"preset": "public_chat", "name": name, "room_version": "11"}
    resp = client.post(
        f"{settings.comms_synapse_base}/_matrix/client/v3/createRoom",
        headers=_auth(token),
        json=payload,
    )
    resp.raise_for_status()
    return resp.json()["room_id"]
def _set_room_state(self, client: httpx.Client, token: str, room_id: str, ev_type: str, content: dict[str, Any]) -> None:
    """PUT a state event of ``ev_type`` (empty state key) into ``room_id``."""
    url = (
        f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/"
        f"{urllib.parse.quote(room_id)}/state/{ev_type}"
    )
    client.put(url, headers=_auth(token), json=content).raise_for_status()
def _set_directory_visibility(self, client: httpx.Client, token: str, room_id: str, visibility: str) -> None:
    """Set the room's public-directory visibility ("public"/"private")."""
    url = (
        f"{settings.comms_synapse_base}/_matrix/client/v3/directory/list/room/"
        f"{urllib.parse.quote(room_id)}"
    )
    client.put(url, headers=_auth(token), json={"visibility": visibility}).raise_for_status()
def _delete_alias(self, client: httpx.Client, token: str, alias: str) -> None:
    """Remove a room alias; OK/Accepted/NotFound all count as success."""
    url = f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}"
    resp = client.delete(url, headers=_auth(token))
    if resp.status_code not in (HTTP_OK, HTTP_ACCEPTED, HTTP_NOT_FOUND):
        resp.raise_for_status()
def _put_alias(self, client: httpx.Client, token: str, alias: str, room_id: str) -> None:
    """Point a room alias at ``room_id``."""
    url = f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}"
    client.put(url, headers=_auth(token), json={"room_id": room_id}).raise_for_status()
def _list_joined_members(self, client: httpx.Client, token: str, room_id: str) -> list[str]:
    """Return full user ids of everyone currently joined to ``room_id``."""
    url = (
        f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/"
        f"{urllib.parse.quote(room_id)}/members?membership=join"
    )
    resp = client.get(url, headers=_auth(token))
    resp.raise_for_status()
    joined: list[str] = []
    for ev in resp.json().get("chunk", []) or []:
        if ev.get("type") != "m.room.member":
            continue
        uid = ev.get("state_key")
        # Only well-formed @user:server state keys are kept.
        if isinstance(uid, str) and uid.startswith("@"):
            joined.append(uid)
    return joined
def _invite_user(self, client: httpx.Client, token: str, room_id: str, user_id: str) -> None:
    """Invite ``user_id`` to the room; statuses other than OK/Accepted raise."""
    url = f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/invite"
    resp = client.post(url, headers=_auth(token), json={"user_id": user_id})
    if resp.status_code not in (HTTP_OK, HTTP_ACCEPTED):
        resp.raise_for_status()
def _power_levels(self) -> dict[str, Any]:
    """Power-level content for new rooms: the seeder is the only level-100 user."""
    return {
        "ban": 50,
        "events": {
            "m.room.avatar": 50,
            "m.room.canonical_alias": 50,
            "m.room.encryption": 100,
            "m.room.history_visibility": 100,
            "m.room.name": 50,
            "m.room.power_levels": 100,
            "m.room.server_acl": 100,
            "m.room.tombstone": 100,
        },
        # Anyone can send ordinary events; state changes need level 50.
        "events_default": 0,
        "historical": 100,
        "invite": 50,
        "kick": 50,
        "m.call.invite": 50,
        "redact": 50,
        "state_default": 50,
        "users": { _canon_user(settings.comms_seeder_user, settings.comms_server_name): 100 },
        "users_default": 0,
    }
def _ensure_user(self, client: httpx.Client, token: str, localpart: str, password: str, admin: bool) -> None:
    """Create the user via the synapse admin API when it does not already exist."""
    admin_token = self._admin_token(token)
    user_id = _canon_user(localpart, settings.comms_server_name)
    url = f"{settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}"
    resp = client.get(url, headers=_auth(admin_token))
    if resp.status_code == HTTP_OK:
        # Already present; existing password/admin flags are left untouched.
        return
    payload = {"password": password, "admin": admin, "deactivated": False}
    create = client.put(url, headers=_auth(admin_token), json=payload)
    if create.status_code not in (HTTP_OK, HTTP_CREATED):
        raise RuntimeError(f"create user {user_id} failed: {create.status_code} {create.text}")
def _ensure_room(self, client: httpx.Client, token: str) -> str:
    """Resolve or create the configured room, then (re)apply its baseline state.

    The state and directory-visibility PUTs at the end are unchecked
    best-effort calls; only room creation failures raise.
    """
    alias = settings.comms_room_alias
    alias_enc = urllib.parse.quote(alias)
    exists = client.get(
        f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{alias_enc}",
        headers=_auth(token),
    )
    if exists.status_code == HTTP_OK:
        room_id = exists.json()["room_id"]
    else:
        create = client.post(
            f"{settings.comms_synapse_base}/_matrix/client/v3/createRoom",
            headers=_auth(token),
            json={
                "preset": "public_chat",
                "name": settings.comms_room_name,
                # Alias localpart: "#local:server" -> "local".
                "room_alias_name": alias.split(":", 1)[0].lstrip("#"),
                "initial_state": [],
                "power_level_content_override": {
                    "events_default": 0,
                    "users_default": 0,
                    "state_default": 50,
                },
            },
        )
        # HTTP_CONFLICT is tolerated — presumably the alias already exists
        # (e.g. created concurrently); we re-resolve it just below.
        if create.status_code not in (HTTP_OK, HTTP_CONFLICT):
            raise RuntimeError(f"create room failed: {create.status_code} {create.text}")
        exists = client.get(
            f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{alias_enc}",
            headers=_auth(token),
        )
        room_id = exists.json()["room_id"]
    state_events = [
        ("m.room.join_rules", {"join_rule": "public"}),
        ("m.room.guest_access", {"guest_access": "can_join"}),
        ("m.room.history_visibility", {"history_visibility": "shared"}),
        ("m.room.canonical_alias", {"alias": alias}),
    ]
    for ev_type, content in state_events:
        client.put(
            f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{room_id}/state/{ev_type}",
            headers=_auth(token),
            json=content,
        )
    client.put(
        f"{settings.comms_synapse_base}/_matrix/client/v3/directory/list/room/{room_id}",
        headers=_auth(token),
        json={"visibility": "public"},
    )
    return room_id
def _join_user(self, client: httpx.Client, token: str, room_id: str, user_id: str) -> None:
    """Force-join ``user_id`` via the synapse admin endpoint (response unchecked)."""
    admin_token = self._admin_token(token)
    url = f"{settings.comms_synapse_base}/_synapse/admin/v1/join/{urllib.parse.quote(room_id)}"
    client.post(url, headers=_auth(admin_token), json={"user_id": user_id})
def _join_all_locals(self, client: httpx.Client, token: str, room_id: str) -> None:
    """Force-join every local, non-deactivated user into ``room_id``.

    Pages through the synapse admin user list, then joins each user via the
    admin join endpoint.  Fix: the original called ``resp.json()`` without
    checking the response status, so an HTTP error surfaced as a JSON decode
    error or silently yielded no users; failures now raise (consistent with
    the mixin's ``_synapse_list_users``).
    """
    users: list[str] = []
    from_token = None
    admin_token = self._admin_token(token)
    while True:
        url = f"{settings.comms_synapse_base}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100"
        if from_token:
            url += f"&from={from_token}"
        resp = client.get(url, headers=_auth(admin_token))
        resp.raise_for_status()
        payload = resp.json()
        users.extend([u["name"] for u in payload.get("users", []) if isinstance(u, dict) and u.get("name")])
        from_token = payload.get("next_token")
        if not from_token:
            break
    for uid in users:
        self._join_user(client, token, room_id, uid)
def _time(self) -> float:
return time.time()
# Module-level singleton used by callers of the comms jobs.
comms = CommsService()
# Explicit public surface of this module.
__all__ = ["CommsService", "_canon_user", "comms", "psycopg", "settings"]

View File

@ -0,0 +1,454 @@
from __future__ import annotations
import base64
from typing import Any
import urllib.parse
import httpx
from ..utils.logging import get_logger
from .comms_protocol import (
HTTP_ACCEPTED,
HTTP_CREATED,
HTTP_NO_CONTENT,
HTTP_NOT_FOUND,
HTTP_OK,
CommsSummary,
DisplayNameTarget,
MasGuestResult,
SynapseGuestResult,
SynapseUserRef,
_auth,
_needs_rename_display,
_needs_rename_username,
)
logger = get_logger(__name__)
class _CommsGuestNameMixin:
def _mas_admin_token(self, client: httpx.Client) -> str:
    """Fetch an admin-scoped MAS access token via the client_credentials grant.

    Retries up to five times with exponential backoff (1, 2, 4, 8 seconds
    via ``self._sleep``).  Fixes: the original also slept ``2**4`` seconds
    after the final failed attempt before raising; the backoff now only
    runs between attempts, and the last underlying exception is chained.
    """
    settings = self._settings
    if not settings.comms_mas_admin_client_id or not settings.comms_mas_admin_client_secret:
        raise RuntimeError("mas admin client credentials missing")
    basic = base64.b64encode(
        f"{settings.comms_mas_admin_client_id}:{settings.comms_mas_admin_client_secret}".encode()
    ).decode()
    last_err: Exception | None = None
    for attempt in range(5):
        try:
            resp = client.post(
                settings.comms_mas_token_url,
                headers={"Authorization": f"Basic {basic}"},
                data={"grant_type": "client_credentials", "scope": "urn:mas:admin"},
            )
            resp.raise_for_status()
            token = resp.json().get("access_token")
            if not isinstance(token, str) or not token:
                raise RuntimeError("missing mas access token")
            return token
        except Exception as exc:  # noqa: BLE001
            last_err = exc
            if attempt < 4:
                self._sleep(2**attempt)
    raise RuntimeError(str(last_err) if last_err else "mas admin token failed") from last_err
def _mas_user_id(self, client: httpx.Client, token: str, username: str) -> str:
url = f"{self._settings.comms_mas_admin_api_base}/users/by-username/{urllib.parse.quote(username)}"
resp = client.get(url, headers=_auth(token))
resp.raise_for_status()
payload = resp.json()
return payload["data"]["id"]
def _mas_personal_session(self, client: httpx.Client, token: str, user_id: str) -> tuple[str, str]:
resp = client.post(
f"{self._settings.comms_mas_admin_api_base}/personal-sessions",
headers=_auth(token),
json={
"actor_user_id": user_id,
"human_name": "guest-name-randomizer",
"scope": "urn:matrix:client:api:*",
"expires_in": 300,
},
)
resp.raise_for_status()
payload = resp.json().get("data", {})
session_id = payload.get("id")
attrs = (payload.get("attributes") or {}) if isinstance(payload, dict) else {}
access_token = attrs.get("access_token")
if not isinstance(access_token, str) or not isinstance(session_id, str):
raise RuntimeError("invalid personal session response")
return access_token, session_id
def _mas_revoke_session(self, client: httpx.Client, token: str, session_id: str) -> None:
    """Best-effort revoke of a MAS personal session; failures are swallowed."""
    url = (
        f"{self._settings.comms_mas_admin_api_base}/personal-sessions/"
        f"{urllib.parse.quote(session_id)}/revoke"
    )
    try:
        client.post(url, headers=_auth(token), json={})
    except Exception:  # noqa: BLE001 - revocation is advisory cleanup only
        return
def _room_members(self, client: httpx.Client, token: str, room_id: str) -> tuple[set[str], set[str]]:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members",
headers=_auth(token),
)
resp.raise_for_status()
payload = resp.json()
members: set[str] = set()
existing: set[str] = set()
for ev in payload.get("chunk", []) or []:
user_id = ev.get("state_key")
if isinstance(user_id, str) and user_id:
members.add(user_id)
display = (ev.get("content") or {}).get("displayname")
if isinstance(display, str) and display:
existing.add(display)
return members, existing
def _mas_list_users(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
users: list[dict[str, Any]] = []
cursor = None
while True:
url = f"{self._settings.comms_mas_admin_api_base}/users?page[size]=100"
if cursor:
url += f"&page[after]={urllib.parse.quote(cursor)}"
resp = client.get(url, headers=_auth(token))
resp.raise_for_status()
payload = resp.json()
data = payload.get("data") or []
if not isinstance(data, list) or not data:
break
users.extend([item for item in data if isinstance(item, dict)])
last = data[-1]
cursor = (
last.get("meta", {})
if isinstance(last, dict)
else {}
).get("page", {}).get("cursor")
if not cursor:
break
return users
def _synapse_list_users(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
users: list[dict[str, Any]] = []
from_token = None
admin_token = self._admin_token(token)
while True:
url = "{}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100".format(
self._settings.comms_synapse_base
)
if from_token:
url += f"&from={urllib.parse.quote(from_token)}"
resp = client.get(url, headers=_auth(admin_token))
resp.raise_for_status()
payload = resp.json()
users.extend([item for item in payload.get("users", []) if isinstance(item, dict)])
from_token = payload.get("next_token")
if not from_token:
break
return users
def _should_prune_guest(self, entry: dict[str, Any], now_ms: int) -> bool:
if not entry.get("is_guest"):
return False
last_seen = entry.get("last_seen_ts")
if last_seen is None:
return False
try:
last_seen = int(last_seen)
except (TypeError, ValueError):
return False
stale_ms = int(self._settings.comms_guest_stale_days) * 24 * 60 * 60 * 1000
return now_ms - last_seen > stale_ms
def _prune_guest(self, client: httpx.Client, token: str, user_id: str) -> bool:
admin_token = self._admin_token(token)
try:
resp = client.delete(
f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
headers=_auth(admin_token),
params={"erase": "true"},
)
except Exception as exc: # noqa: BLE001
logger.info(
"guest prune failed",
extra={"event": "comms_guest_prune", "status": "error", "detail": str(exc)},
)
return False
if resp.status_code in (HTTP_OK, HTTP_ACCEPTED, HTTP_NO_CONTENT, HTTP_NOT_FOUND):
return True
logger.info(
"guest prune failed",
extra={
"event": "comms_guest_prune",
"status": "error",
"detail": f"{resp.status_code} {resp.text}",
},
)
return False
def _get_displayname(self, client: httpx.Client, token: str, user_id: str) -> str | None:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/profile/{urllib.parse.quote(user_id)}",
headers=_auth(token),
)
resp.raise_for_status()
return resp.json().get("displayname")
def _get_displayname_admin(self, client: httpx.Client, token: str, user_id: str) -> str | None:
admin_token = self._admin_token(token)
resp = client.get(
f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
headers=_auth(admin_token),
)
if resp.status_code == HTTP_NOT_FOUND:
return None
resp.raise_for_status()
return resp.json().get("displayname")
def _set_displayname(self, client: httpx.Client, token: str, target: DisplayNameTarget) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/profile/{urllib.parse.quote(target.user_id)}/displayname",
headers=_auth(token),
json={"displayname": target.name},
)
resp.raise_for_status()
if not target.in_room:
return
state_url = (
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(target.room_id)}"
f"/state/m.room.member/{urllib.parse.quote(target.user_id)}"
)
client.put(
state_url,
headers=_auth(token),
json={"membership": "join", "displayname": target.name},
)
def _set_displayname_admin(self, client: httpx.Client, token: str, user_id: str, name: str) -> bool:
admin_token = self._admin_token(token)
resp = client.put(
f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
headers=_auth(admin_token),
json={"displayname": name},
)
return resp.status_code in (HTTP_OK, HTTP_CREATED, HTTP_NO_CONTENT)
def _db_rename_numeric(self, existing: set[str]) -> int:
settings = self._settings
if not settings.comms_synapse_db_password:
return 0
renamed = 0
conn = self._connect_synapse_db()
try:
with conn:
with conn.cursor() as cur:
pattern = f"^@\\d+:{settings.comms_server_name}$"
cur.execute(
"SELECT user_id, full_user_id, displayname FROM profiles WHERE full_user_id ~ %s",
(pattern,),
)
profile_rows = cur.fetchall()
profile_index = {row[1]: row for row in profile_rows}
for _user_id, full_user_id, display in profile_rows:
if display and not _needs_rename_display(display):
continue
new_name = self._pick_guest_name(existing)
if not new_name:
continue
cur.execute(
"UPDATE profiles SET displayname = %s WHERE full_user_id = %s",
(new_name, full_user_id),
)
renamed += 1
cur.execute(
"SELECT name FROM users WHERE name ~ %s",
(pattern,),
)
users = [row[0] for row in cur.fetchall()]
if not users:
return renamed
cur.execute(
"SELECT user_id, full_user_id FROM profiles WHERE full_user_id = ANY(%s)",
(users,),
)
for existing_full in cur.fetchall():
profile_index.setdefault(existing_full[1], existing_full)
for full_user_id in users:
if full_user_id in profile_index:
continue
localpart = full_user_id.split(":", 1)[0].lstrip("@")
new_name = self._pick_guest_name(existing)
if not new_name:
continue
cur.execute(
"INSERT INTO profiles (user_id, displayname, full_user_id) VALUES (%s, %s, %s) "
"ON CONFLICT (full_user_id) DO UPDATE SET displayname = EXCLUDED.displayname",
(localpart, new_name, full_user_id),
)
renamed += 1
finally:
conn.close()
return renamed
def _validate_guest_name_settings(self) -> None:
if not self._settings.comms_mas_admin_client_id or not self._settings.comms_mas_admin_client_secret:
raise RuntimeError("comms mas admin secret missing")
if not self._settings.comms_synapse_base:
raise RuntimeError("comms synapse base missing")
def _room_context(self, client: httpx.Client, token: str) -> tuple[str, set[str], set[str]]:
room_id = self._resolve_alias(client, token, self._settings.comms_room_alias)
members, existing = self._room_members(client, token, room_id)
return room_id, members, existing
def _rename_mas_guests(self, client: httpx.Client, admin_token: str, room_id: str, members: set[str], existing: set[str]) -> MasGuestResult:
renamed = 0
skipped = 0
mas_usernames: set[str] = set()
users = self._mas_list_users(client, admin_token)
for user in users:
attrs = user.get("attributes") or {}
username = attrs.get("username") or ""
if isinstance(username, str) and username:
mas_usernames.add(username)
legacy_guest = attrs.get("legacy_guest")
if not isinstance(username, str) or not username:
skipped += 1
continue
if not (legacy_guest or _needs_rename_username(username)):
skipped += 1
continue
user_id = user.get("id")
if not isinstance(user_id, str) or not user_id:
skipped += 1
continue
full_user = f"@{username}:{self._settings.comms_server_name}"
access_token, session_id = self._mas_personal_session(client, admin_token, user_id)
try:
display = self._get_displayname(client, access_token, full_user)
if display and not _needs_rename_display(display):
skipped += 1
continue
new_name = self._pick_guest_name(existing)
if not new_name:
skipped += 1
continue
self._set_displayname(
client,
access_token,
DisplayNameTarget(
room_id=room_id,
user_id=full_user,
name=new_name,
in_room=full_user in members,
),
)
renamed += 1
finally:
self._mas_revoke_session(client, admin_token, session_id)
return MasGuestResult(renamed=renamed, skipped=skipped, usernames=mas_usernames)
def _synapse_entries(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
try:
return self._synapse_list_users(client, token)
except Exception as exc: # noqa: BLE001
logger.info(
"synapse admin list skipped",
extra={"event": "comms_guest_list", "status": "error", "detail": str(exc)},
)
return []
def _synapse_user_id(self, entry: dict[str, Any]) -> SynapseUserRef | None:
user_id = entry.get("name") or ""
if not isinstance(user_id, str) or not user_id.startswith("@"):
return None
localpart = user_id.split(":", 1)[0].lstrip("@")
return SynapseUserRef(entry=entry, user_id=user_id, localpart=localpart)
def _maybe_prune_synapse_guest(self, client: httpx.Client, token: str, entry: dict[str, Any], user_id: str, now_ms: int) -> bool:
if not entry.get("is_guest"):
return False
if not self._should_prune_guest(entry, now_ms):
return False
return self._prune_guest(client, token, user_id)
def _needs_synapse_rename(self, client: httpx.Client, token: str, user: SynapseUserRef, mas_usernames: set[str]) -> bool:
if user.localpart in mas_usernames:
return False
is_guest = user.entry.get("is_guest")
if not (is_guest or _needs_rename_username(user.localpart)):
return False
display = self._get_displayname_admin(client, token, user.user_id)
if display and not _needs_rename_display(display):
return False
return True
def _rename_synapse_user(self, client: httpx.Client, token: str, existing: set[str], user_id: str) -> bool:
new_name = self._pick_guest_name(existing)
if not new_name:
return False
return self._set_displayname_admin(client, token, user_id, new_name)
def _rename_synapse_guests(self, client: httpx.Client, token: str, existing: set[str], mas_usernames: set[str]) -> SynapseGuestResult:
renamed = 0
pruned = 0
entries = self._synapse_entries(client, token)
now_ms = int(self._time() * 1000)
for entry in entries:
user_ref = self._synapse_user_id(entry)
if not user_ref:
continue
if self._maybe_prune_synapse_guest(client, token, user_ref.entry, user_ref.user_id, now_ms):
pruned += 1
continue
if not self._needs_synapse_rename(client, token, user_ref, mas_usernames):
continue
if self._rename_synapse_user(client, token, existing, user_ref.user_id):
renamed += 1
return SynapseGuestResult(renamed=renamed, pruned=pruned)
def run_guest_name_randomizer(self, wait: bool = True) -> dict[str, Any]:
self._validate_guest_name_settings()
with self._client() as client:
admin_token = self._mas_admin_token(client)
seeder_id = self._mas_user_id(client, admin_token, self._settings.comms_seeder_user)
seeder_token, seeder_session = self._mas_personal_session(client, admin_token, seeder_id)
try:
room_id, members, existing = self._room_context(client, seeder_token)
mas_result = self._rename_mas_guests(client, admin_token, room_id, members, existing)
synapse_result = self._rename_synapse_guests(
client,
seeder_token,
existing,
mas_result.usernames,
)
db_renamed = self._db_rename_numeric(existing)
finally:
self._mas_revoke_session(client, admin_token, seeder_session)
renamed = mas_result.renamed + synapse_result.renamed + db_renamed
pruned = synapse_result.pruned
skipped = mas_result.skipped
processed = renamed + pruned + skipped
summary = CommsSummary(processed, renamed, pruned, skipped)
logger.info(
"comms guest name sync finished",
extra={
"event": "comms_guest_name",
"status": "ok",
"processed": summary.processed,
"renamed": summary.renamed,
"pruned": summary.pruned,
"skipped": summary.skipped,
},
)
return {"status": "ok", **summary.__dict__}

View File

@ -0,0 +1,72 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
# HTTP status codes used by the comms service helpers.
HTTP_OK = 200
HTTP_CREATED = 201
HTTP_ACCEPTED = 202
HTTP_NO_CONTENT = 204
HTTP_NOT_FOUND = 404
HTTP_CONFLICT = 409
@dataclass(frozen=True)
class CommsSummary:
    """Aggregate counters reported by a guest-name sync run."""
    processed: int  # renamed + pruned + skipped
    renamed: int
    pruned: int
    skipped: int
    detail: str = ""  # optional free-text annotation
@dataclass(frozen=True)
class MasGuestResult:
    """Outcome of the MAS rename pass."""
    renamed: int
    skipped: int
    usernames: set[str]  # all valid MAS usernames seen (used to exclude them from the synapse pass)
@dataclass(frozen=True)
class SynapseGuestResult:
    """Outcome of the synapse rename/prune pass."""
    renamed: int
    pruned: int
@dataclass(frozen=True)
class DisplayNameTarget:
    """Parameters for setting a user's display name (profile plus room member state)."""
    room_id: str
    user_id: str
    name: str
    in_room: bool  # when True, the room member state event is also updated
@dataclass(frozen=True)
class SynapseUserRef:
    """A synapse admin-API user entry with its parsed id parts."""
    entry: dict[str, Any]  # raw admin-API user record
    user_id: str  # full @local:server id
    localpart: str  # id with the leading '@' and server suffix stripped
def _auth(token: str) -> dict[str, str]:
return {"Authorization": f"Bearer {token}"}
def _canon_user(user: str, server_name: str) -> str:
user = (user or "").strip()
if user.startswith("@") and ":" in user:
return user
user = user.lstrip("@")
if ":" in user:
return f"@{user}"
return f"@{user}:{server_name}"
def _needs_rename_username(username: str) -> bool:
return username.isdigit() or username.startswith("guest-")
def _needs_rename_display(display: str | None) -> bool:
if not display:
return True
return display.isdigit() or display.startswith("guest-")

View File

@ -0,0 +1,389 @@
from __future__ import annotations
from typing import Any
import urllib.parse
import httpx
from ..utils.logging import get_logger
from .comms_protocol import (
HTTP_ACCEPTED,
HTTP_CONFLICT,
HTTP_CREATED,
HTTP_NOT_FOUND,
HTTP_OK,
_auth,
_canon_user,
)
logger = get_logger(__name__)
class _CommsRoomOpsMixin:
    """Room-level operations (pin, reset, seed) against the comms Matrix homeserver."""

    def run_pin_invite(self, wait: bool = True) -> dict[str, Any]:
        """Ensure the configured invite message is posted and pinned.

        Idempotent: when any already-pinned event carries the configured
        message body, nothing new is sent. ``wait`` is accepted for signature
        parity with the other run_* entry points and is not consulted here.
        Returns a {"status", "detail"} dict.
        """
        if not self._settings.comms_seeder_password:
            raise RuntimeError("comms seeder password missing")
        with self._client() as client:
            token = self._login(client, self._settings.comms_seeder_user, self._settings.comms_seeder_password)
            room_id = self._resolve_alias(client, token, self._settings.comms_room_alias)
            pinned = self._get_pinned(client, token, room_id)
            for event_id in pinned:
                event = self._get_event(client, token, room_id, event_id)
                # Compare on message body; missing events resolve to None.
                if event and (event.get("content") or {}).get("body") == self._settings.comms_pin_message:
                    return {"status": "ok", "detail": "already pinned"}
            event_id = self._send_message(client, token, room_id, self._settings.comms_pin_message)
            if not event_id:
                return {"status": "error", "detail": "pin event_id missing"}
            self._pin_message(client, token, room_id, event_id)
            return {"status": "ok", "detail": "pinned"}
    def run_reset_room(self, wait: bool = True) -> dict[str, Any]:
        """Replace the comms room with a fresh one and tombstone the old room.

        Creates a new public room, moves the canonical alias over, re-invites
        the previous non-guest members plus the bot, pins the configured
        message, then locks down and tombstones the old room. ``wait`` is
        accepted for signature parity and not consulted here.
        """
        if not self._settings.comms_seeder_password:
            raise RuntimeError("comms seeder password missing")
        with self._client() as client:
            token = self._login_with_retry(client, self._settings.comms_seeder_user, self._settings.comms_seeder_password)
            old_room_id = self._resolve_alias(client, token, self._settings.comms_room_alias)
            new_room_id = self._create_room(client, token, self._settings.comms_room_name)
            # Open up the new room: public join, guests allowed, shared history.
            self._set_room_state(client, token, new_room_id, "m.room.join_rules", {"join_rule": "public"})
            self._set_room_state(client, token, new_room_id, "m.room.guest_access", {"guest_access": "can_join"})
            self._set_room_state(
                client,
                token,
                new_room_id,
                "m.room.history_visibility",
                {"history_visibility": "shared"},
            )
            self._set_room_state(client, token, new_room_id, "m.room.power_levels", self._power_levels())
            # Move the canonical alias from the old room to the new one.
            self._delete_alias(client, token, self._settings.comms_room_alias)
            self._put_alias(client, token, self._settings.comms_room_alias, new_room_id)
            self._set_room_state(
                client,
                token,
                new_room_id,
                "m.room.canonical_alias",
                {"alias": self._settings.comms_room_alias},
            )
            self._set_directory_visibility(client, token, new_room_id, "public")
            bot_user_id = _canon_user(self._settings.comms_bot_user, self._settings.comms_server_name)
            self._invite_user(client, token, new_room_id, bot_user_id)
            # Re-invite previous members, skipping the seeder itself and
            # numeric-localpart guest accounts.
            for uid in self._list_joined_members(client, token, old_room_id):
                if uid == _canon_user(self._settings.comms_seeder_user, self._settings.comms_server_name):
                    continue
                localpart = uid.split(":", 1)[0].lstrip("@")
                if localpart.isdigit():
                    continue
                self._invite_user(client, token, new_room_id, uid)
            event_id = self._send_message(client, token, new_room_id, self._settings.comms_pin_message)
            if not event_id:
                raise RuntimeError("pin message event_id missing")
            self._set_room_state(client, token, new_room_id, "m.room.pinned_events", {"pinned": [event_id]})
            # Retire the old room: hide it, lock it down, and tombstone it.
            self._set_directory_visibility(client, token, old_room_id, "private")
            self._set_room_state(client, token, old_room_id, "m.room.join_rules", {"join_rule": "invite"})
            self._set_room_state(client, token, old_room_id, "m.room.guest_access", {"guest_access": "forbidden"})
            self._set_room_state(
                client,
                token,
                old_room_id,
                "m.room.tombstone",
                {
                    "body": "Othrys has been reset. Please join the new room.",
                    "replacement_room": new_room_id,
                },
            )
            self._send_message(
                client,
                token,
                old_room_id,
                "Othrys was reset. Join the new room at https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join",
            )
            return {"status": "ok", "detail": f"old_room_id={old_room_id} new_room_id={new_room_id}"}
    def run_seed_room(self, wait: bool = True) -> dict[str, Any]:
        """Ensure the service accounts exist and all local users are in the room.

        Creates the seeder (admin) and bot (non-admin) accounts when possible,
        ensures the room exists with baseline state, then force-joins the bot
        and every local user. ``wait`` is accepted for signature parity and not
        consulted here.
        """
        if not self._settings.comms_seeder_password or not self._settings.comms_bot_password:
            raise RuntimeError("comms seeder/bot password missing")
        with self._client() as client:
            token = self._login(client, self._settings.comms_seeder_user, self._settings.comms_seeder_password)
            for user, password, admin in (
                (self._settings.comms_seeder_user, self._settings.comms_seeder_password, True),
                (self._settings.comms_bot_user, self._settings.comms_bot_password, False),
            ):
                try:
                    self._ensure_user(client, token, user, password, admin)
                except RuntimeError as exc:
                    message = str(exc)
                    # Without server-admin rights we cannot create users;
                    # warn and continue so room setup still proceeds.
                    if "You are not a server admin" in message:
                        logger.warning(
                            "comms seed room ensure skipped",
                            extra={"event": "comms_seed_room", "user": user, "detail": message},
                        )
                        continue
                    raise
            room_id = self._ensure_room(client, token)
            self._join_user(client, token, room_id, _canon_user(self._settings.comms_bot_user, self._settings.comms_server_name))
            self._join_all_locals(client, token, room_id)
            return {"status": "ok", "detail": "room seeded"}
    def _login(self, client: httpx.Client, user: str, password: str) -> str:
        """Password-login against the auth base URL; return the access token.

        Raises RuntimeError on a non-200 response or when the response body
        lacks a non-empty string ``access_token``.
        """
        resp = client.post(
            f"{self._settings.comms_auth_base}/_matrix/client/v3/login",
            json={
                "type": "m.login.password",
                "identifier": {"type": "m.id.user", "user": _canon_user(user, self._settings.comms_server_name)},
                "password": password,
            },
        )
        if resp.status_code != HTTP_OK:
            raise RuntimeError(f"login failed: {resp.status_code} {resp.text}")
        payload = resp.json()
        token = payload.get("access_token")
        if not isinstance(token, str) or not token:
            raise RuntimeError("login missing token")
        return token
def _login_with_retry(self, client: httpx.Client, user: str, password: str) -> str:
last: Exception | None = None
for attempt in range(1, 6):
try:
return self._login(client, user, password)
except Exception as exc: # noqa: BLE001
last = exc
self._sleep(attempt * 2)
raise RuntimeError(str(last) if last else "login failed")
def _resolve_alias(self, client: httpx.Client, token: str, alias: str) -> str:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
headers=_auth(token),
)
resp.raise_for_status()
payload = resp.json()
return payload["room_id"]
def _get_pinned(self, client: httpx.Client, token: str, room_id: str) -> list[str]:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/m.room.pinned_events",
headers=_auth(token),
)
if resp.status_code == HTTP_NOT_FOUND:
return []
resp.raise_for_status()
pinned = resp.json().get("pinned", [])
return [item for item in pinned if isinstance(item, str)]
def _get_event(self, client: httpx.Client, token: str, room_id: str, event_id: str) -> dict[str, Any] | None:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/event/{urllib.parse.quote(event_id)}",
headers=_auth(token),
)
if resp.status_code == HTTP_NOT_FOUND:
return None
resp.raise_for_status()
return resp.json()
def _send_message(self, client: httpx.Client, token: str, room_id: str, body: str) -> str:
resp = client.post(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/send/m.room.message",
headers=_auth(token),
json={"msgtype": "m.text", "body": body},
)
resp.raise_for_status()
payload = resp.json()
event_id = payload.get("event_id")
return event_id if isinstance(event_id, str) else ""
def _pin_message(self, client: httpx.Client, token: str, room_id: str, event_id: str) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/m.room.pinned_events",
headers=_auth(token),
json={"pinned": [event_id]},
)
resp.raise_for_status()
def _create_room(self, client: httpx.Client, token: str, name: str) -> str:
resp = client.post(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/createRoom",
headers=_auth(token),
json={"preset": "public_chat", "name": name, "room_version": "11"},
)
resp.raise_for_status()
return resp.json()["room_id"]
def _set_room_state(self, client: httpx.Client, token: str, room_id: str, ev_type: str, content: dict[str, Any]) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/{ev_type}",
headers=_auth(token),
json=content,
)
resp.raise_for_status()
def _set_directory_visibility(self, client: httpx.Client, token: str, room_id: str, visibility: str) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/list/room/{urllib.parse.quote(room_id)}",
headers=_auth(token),
json={"visibility": visibility},
)
resp.raise_for_status()
def _delete_alias(self, client: httpx.Client, token: str, alias: str) -> None:
resp = client.delete(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
headers=_auth(token),
)
if resp.status_code in (HTTP_OK, HTTP_ACCEPTED, HTTP_NOT_FOUND):
return
resp.raise_for_status()
def _put_alias(self, client: httpx.Client, token: str, alias: str, room_id: str) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
headers=_auth(token),
json={"room_id": room_id},
)
resp.raise_for_status()
def _list_joined_members(self, client: httpx.Client, token: str, room_id: str) -> list[str]:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members?membership=join",
headers=_auth(token),
)
resp.raise_for_status()
members = []
for ev in resp.json().get("chunk", []) or []:
if ev.get("type") != "m.room.member":
continue
uid = ev.get("state_key")
if isinstance(uid, str) and uid.startswith("@"):
members.append(uid)
return members
def _invite_user(self, client: httpx.Client, token: str, room_id: str, user_id: str) -> None:
resp = client.post(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/invite",
headers=_auth(token),
json={"user_id": user_id},
)
if resp.status_code in (HTTP_OK, HTTP_ACCEPTED):
return
resp.raise_for_status()
def _power_levels(self) -> dict[str, Any]:
return {
"ban": 50,
"events": {
"m.room.avatar": 50,
"m.room.canonical_alias": 50,
"m.room.encryption": 100,
"m.room.history_visibility": 100,
"m.room.name": 50,
"m.room.power_levels": 100,
"m.room.server_acl": 100,
"m.room.tombstone": 100,
},
"events_default": 0,
"historical": 100,
"invite": 50,
"kick": 50,
"m.call.invite": 50,
"redact": 50,
"state_default": 50,
"users": {_canon_user(self._settings.comms_seeder_user, self._settings.comms_server_name): 100},
"users_default": 0,
}
    def _ensure_user(self, client: httpx.Client, token: str, localpart: str, password: str, admin: bool) -> None:
        """Create the user via the Synapse admin API if it does not already exist.

        Existing users are left untouched (password/admin flags are NOT
        updated). Raises RuntimeError when creation is rejected, e.g. when the
        caller lacks server-admin rights.
        """
        admin_token = self._admin_token(token)
        user_id = _canon_user(localpart, self._settings.comms_server_name)
        url = f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}"
        resp = client.get(url, headers=_auth(admin_token))
        if resp.status_code == HTTP_OK:
            # User already exists; nothing to do.
            return
        payload = {"password": password, "admin": admin, "deactivated": False}
        create = client.put(url, headers=_auth(admin_token), json=payload)
        if create.status_code not in (HTTP_OK, HTTP_CREATED):
            raise RuntimeError(f"create user {user_id} failed: {create.status_code} {create.text}")
    def _ensure_room(self, client: httpx.Client, token: str) -> str:
        """Resolve the configured room alias, creating the room when missing.

        Also (re)applies the baseline state events and publishes the room in
        the public directory. Returns the room_id.
        """
        alias = self._settings.comms_room_alias
        alias_enc = urllib.parse.quote(alias)
        exists = client.get(
            f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{alias_enc}",
            headers=_auth(token),
        )
        if exists.status_code == HTTP_OK:
            room_id = exists.json()["room_id"]
        else:
            create = client.post(
                f"{self._settings.comms_synapse_base}/_matrix/client/v3/createRoom",
                headers=_auth(token),
                json={
                    "preset": "public_chat",
                    "name": self._settings.comms_room_name,
                    "room_alias_name": alias.split(":", 1)[0].lstrip("#"),
                    "initial_state": [],
                    "power_level_content_override": {
                        "events_default": 0,
                        "users_default": 0,
                        "state_default": 50,
                    },
                },
            )
            # 409 (conflict) means the alias appeared concurrently; fall
            # through and re-resolve it below.
            if create.status_code not in (HTTP_OK, HTTP_CONFLICT):
                raise RuntimeError(f"create room failed: {create.status_code} {create.text}")
            exists = client.get(
                f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{alias_enc}",
                headers=_auth(token),
            )
            room_id = exists.json()["room_id"]
        # Re-assert baseline room state; responses are deliberately unchecked.
        state_events = [
            ("m.room.join_rules", {"join_rule": "public"}),
            ("m.room.guest_access", {"guest_access": "can_join"}),
            ("m.room.history_visibility", {"history_visibility": "shared"}),
            ("m.room.canonical_alias", {"alias": alias}),
        ]
        for ev_type, content in state_events:
            client.put(
                f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{room_id}/state/{ev_type}",
                headers=_auth(token),
                json=content,
            )
        client.put(
            f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/list/room/{room_id}",
            headers=_auth(token),
            json={"visibility": "public"},
        )
        return room_id
    def _join_user(self, client: httpx.Client, token: str, room_id: str, user_id: str) -> None:
        """Force-join *user_id* into *room_id* via the Synapse admin API.

        NOTE(review): the response status is not checked — assumed best-effort
        so one failing user does not abort bulk joins; confirm intent.
        """
        admin_token = self._admin_token(token)
        client.post(
            f"{self._settings.comms_synapse_base}/_synapse/admin/v1/join/{urllib.parse.quote(room_id)}",
            headers=_auth(admin_token),
            json={"user_id": user_id},
        )
def _join_all_locals(self, client: httpx.Client, token: str, room_id: str) -> None:
users: list[str] = []
from_token = None
admin_token = self._admin_token(token)
while True:
url = f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100"
if from_token:
url += f"&from={from_token}"
resp = client.get(url, headers=_auth(admin_token))
payload = resp.json()
users.extend([u["name"] for u in payload.get("users", []) if isinstance(u, dict) and u.get("name")])
from_token = payload.get("next_token")
if not from_token:
break
for uid in users:
self._join_user(client, token, room_id, uid)

View File

@ -2,10 +2,8 @@ from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any
import textwrap
import httpx
from typing import Any
from ..k8s.exec import ExecError, PodExecutor
from ..k8s.pods import PodSelectionError
@ -13,6 +11,8 @@ from ..settings import settings
from ..utils.logging import get_logger
from ..utils.passwords import random_password
from .keycloak_admin import keycloak_admin
from .firefly_scripts import FIREFLY_PASSWORD_CHECK_SCRIPT as _FIREFLY_PASSWORD_CHECK_SCRIPT
from .firefly_scripts import FIREFLY_SYNC_SCRIPT as _FIREFLY_SYNC_SCRIPT
from .mailu import mailu
@ -27,230 +27,6 @@ FIREFLY_PASSWORD_ROTATED_ATTR = "firefly_password_rotated_at"
logger = get_logger(__name__)
_FIREFLY_SYNC_SCRIPT = textwrap.dedent(
"""
<?php
declare(strict_types=1);
use FireflyIII\\Console\\Commands\\Correction\\CreatesGroupMemberships;
use FireflyIII\\Models\\Role;
use FireflyIII\\Repositories\\User\\UserRepositoryInterface;
use FireflyIII\\Support\\Facades\\FireflyConfig;
use FireflyIII\\User;
use Illuminate\\Contracts\\Console\\Kernel as ConsoleKernel;
function log_line(string $message): void
{
fwrite(STDOUT, $message . PHP_EOL);
}
function error_line(string $message): void
{
fwrite(STDERR, $message . PHP_EOL);
}
function find_app_root(): string
{
$candidates = [];
$env_root = getenv('FIREFLY_APP_DIR') ?: '';
if ($env_root !== '') {
$candidates[] = $env_root;
}
$candidates[] = '/var/www/html';
$candidates[] = '/var/www/firefly-iii';
$candidates[] = '/app';
foreach ($candidates as $candidate) {
if (!is_dir($candidate)) {
continue;
}
if (file_exists($candidate . '/vendor/autoload.php')) {
return $candidate;
}
}
return '';
}
$email = trim((string) getenv('FIREFLY_USER_EMAIL'));
$password = (string) getenv('FIREFLY_USER_PASSWORD');
if ($email === '' || $password === '') {
error_line('missing FIREFLY_USER_EMAIL or FIREFLY_USER_PASSWORD');
exit(1);
}
$root = find_app_root();
if ($root === '') {
error_line('firefly app root not found');
exit(1);
}
$autoload = $root . '/vendor/autoload.php';
$app_bootstrap = $root . '/bootstrap/app.php';
if (!file_exists($autoload) || !file_exists($app_bootstrap)) {
error_line('firefly bootstrap files missing');
exit(1);
}
require $autoload;
$app = require $app_bootstrap;
$kernel = $app->make(ConsoleKernel::class);
$kernel->bootstrap();
try {
FireflyConfig::set('single_user_mode', true);
} catch (Throwable $exc) {
error_line('failed to enforce single_user_mode: ' . $exc->getMessage());
}
$repository = $app->make(UserRepositoryInterface::class);
$existing_user = User::where('email', $email)->first();
$first_user = User::count() == 0;
if (!$existing_user) {
$existing_user = User::create(
[
'email' => $email,
'password' => bcrypt($password),
'blocked' => false,
'blocked_code' => null,
]
);
if ($first_user) {
$role = Role::where('name', 'owner')->first();
if ($role) {
$existing_user->roles()->attach($role);
}
}
log_line(sprintf('created firefly user %s', $email));
} else {
log_line(sprintf('updating firefly user %s', $email));
}
$existing_user->blocked = false;
$existing_user->blocked_code = null;
$existing_user->save();
$repository->changePassword($existing_user, $password);
CreatesGroupMemberships::createGroupMembership($existing_user);
log_line('firefly user sync complete');
"""
).strip()
_FIREFLY_PASSWORD_CHECK_SCRIPT = textwrap.dedent(
"""
<?php
declare(strict_types=1);
use FireflyIII\\Support\\Facades\\FireflyConfig;
use FireflyIII\\User;
use Illuminate\\Contracts\\Console\\Kernel as ConsoleKernel;
use Illuminate\\Support\\Facades\\Hash;
function log_line(string $message): void
{
fwrite(STDOUT, $message . PHP_EOL);
}
function error_line(string $message): void
{
fwrite(STDERR, $message . PHP_EOL);
}
function find_app_root(): string
{
$candidates = [];
$env_root = getenv('FIREFLY_APP_DIR') ?: '';
if ($env_root !== '') {
$candidates[] = $env_root;
}
$candidates[] = '/var/www/html';
$candidates[] = '/var/www/firefly-iii';
$candidates[] = '/app';
foreach ($candidates as $candidate) {
if (!is_dir($candidate)) {
continue;
}
if (file_exists($candidate . '/vendor/autoload.php')) {
return $candidate;
}
}
return '';
}
$email = trim((string) getenv('FIREFLY_USER_EMAIL'));
$username = trim((string) getenv('FIREFLY_USER_USERNAME'));
$password = (string) getenv('FIREFLY_USER_PASSWORD');
if (($email === '' && $username === '') || $password === '') {
error_line('missing FIREFLY_USER_EMAIL or FIREFLY_USER_USERNAME or FIREFLY_USER_PASSWORD');
exit(2);
}
$root = find_app_root();
if ($root === '') {
error_line('firefly app root not found');
exit(2);
}
$autoload = $root . '/vendor/autoload.php';
$app_bootstrap = $root . '/bootstrap/app.php';
if (!file_exists($autoload) || !file_exists($app_bootstrap)) {
error_line('firefly bootstrap files missing');
exit(2);
}
require $autoload;
$app = require $app_bootstrap;
$kernel = $app->make(ConsoleKernel::class);
$kernel->bootstrap();
try {
FireflyConfig::set('single_user_mode', true);
} catch (Throwable $exc) {
error_line('failed to enforce single_user_mode: ' . $exc->getMessage());
}
if ($email !== '') {
$query = User::where('email', $email);
} else {
$query = User::where('username', $username);
}
if ($email !== '' && $username !== '') {
$query = $query->orWhere('username', $username);
}
$existing_user = $query->first();
if (!$existing_user) {
error_line('firefly user missing');
exit(3);
}
if (Hash::check($password, $existing_user->password)) {
log_line('password match');
exit(0);
}
log_line('password mismatch');
exit(1);
"""
).strip()
def _firefly_exec_command() -> str:
return f"php <<'PHP'\n{_FIREFLY_SYNC_SCRIPT}\nPHP"
@ -498,6 +274,8 @@ def _rotation_check_input(username: str) -> tuple[FireflySyncInput | UserSyncOut
class FireflyService:
"""Synchronize Keycloak users and password rotations into Firefly."""
def __init__(self) -> None:
self._executor = PodExecutor(
settings.firefly_namespace,

View File

@ -0,0 +1,230 @@
"""Embedded scripts executed inside the firefly application pod."""
from __future__ import annotations
import textwrap
# PHP program piped to `php` inside the firefly pod. Boots the Laravel app,
# forces single_user_mode, then creates or updates the user identified by
# FIREFLY_USER_EMAIL with FIREFLY_USER_PASSWORD (first user becomes owner).
# NOTE: this is a runtime string literal — its contents must not be reformatted.
FIREFLY_SYNC_SCRIPT = textwrap.dedent(
    """
<?php
declare(strict_types=1);
use FireflyIII\\Console\\Commands\\Correction\\CreatesGroupMemberships;
use FireflyIII\\Models\\Role;
use FireflyIII\\Repositories\\User\\UserRepositoryInterface;
use FireflyIII\\Support\\Facades\\FireflyConfig;
use FireflyIII\\User;
use Illuminate\\Contracts\\Console\\Kernel as ConsoleKernel;
function log_line(string $message): void
{
fwrite(STDOUT, $message . PHP_EOL);
}
function error_line(string $message): void
{
fwrite(STDERR, $message . PHP_EOL);
}
function find_app_root(): string
{
$candidates = [];
$env_root = getenv('FIREFLY_APP_DIR') ?: '';
if ($env_root !== '') {
$candidates[] = $env_root;
}
$candidates[] = '/var/www/html';
$candidates[] = '/var/www/firefly-iii';
$candidates[] = '/app';
foreach ($candidates as $candidate) {
if (!is_dir($candidate)) {
continue;
}
if (file_exists($candidate . '/vendor/autoload.php')) {
return $candidate;
}
}
return '';
}
$email = trim((string) getenv('FIREFLY_USER_EMAIL'));
$password = (string) getenv('FIREFLY_USER_PASSWORD');
if ($email === '' || $password === '') {
error_line('missing FIREFLY_USER_EMAIL or FIREFLY_USER_PASSWORD');
exit(1);
}
$root = find_app_root();
if ($root === '') {
error_line('firefly app root not found');
exit(1);
}
$autoload = $root . '/vendor/autoload.php';
$app_bootstrap = $root . '/bootstrap/app.php';
if (!file_exists($autoload) || !file_exists($app_bootstrap)) {
error_line('firefly bootstrap files missing');
exit(1);
}
require $autoload;
$app = require $app_bootstrap;
$kernel = $app->make(ConsoleKernel::class);
$kernel->bootstrap();
try {
FireflyConfig::set('single_user_mode', true);
} catch (Throwable $exc) {
error_line('failed to enforce single_user_mode: ' . $exc->getMessage());
}
$repository = $app->make(UserRepositoryInterface::class);
$existing_user = User::where('email', $email)->first();
$first_user = User::count() == 0;
if (!$existing_user) {
$existing_user = User::create(
[
'email' => $email,
'password' => bcrypt($password),
'blocked' => false,
'blocked_code' => null,
]
);
if ($first_user) {
$role = Role::where('name', 'owner')->first();
if ($role) {
$existing_user->roles()->attach($role);
}
}
log_line(sprintf('created firefly user %s', $email));
} else {
log_line(sprintf('updating firefly user %s', $email));
}
$existing_user->blocked = false;
$existing_user->blocked_code = null;
$existing_user->save();
$repository->changePassword($existing_user, $password);
CreatesGroupMemberships::createGroupMembership($existing_user);
log_line('firefly user sync complete');
"""
).strip()
# PHP program that verifies FIREFLY_USER_PASSWORD against the stored hash for
# the user identified by FIREFLY_USER_EMAIL / FIREFLY_USER_USERNAME.
# Exit codes: 0 match, 1 mismatch, 2 bad invocation/bootstrap, 3 user missing.
# NOTE: this is a runtime string literal — its contents must not be reformatted.
FIREFLY_PASSWORD_CHECK_SCRIPT = textwrap.dedent(
    """
<?php
declare(strict_types=1);
use FireflyIII\\Support\\Facades\\FireflyConfig;
use FireflyIII\\User;
use Illuminate\\Contracts\\Console\\Kernel as ConsoleKernel;
use Illuminate\\Support\\Facades\\Hash;
function log_line(string $message): void
{
fwrite(STDOUT, $message . PHP_EOL);
}
function error_line(string $message): void
{
fwrite(STDERR, $message . PHP_EOL);
}
function find_app_root(): string
{
$candidates = [];
$env_root = getenv('FIREFLY_APP_DIR') ?: '';
if ($env_root !== '') {
$candidates[] = $env_root;
}
$candidates[] = '/var/www/html';
$candidates[] = '/var/www/firefly-iii';
$candidates[] = '/app';
foreach ($candidates as $candidate) {
if (!is_dir($candidate)) {
continue;
}
if (file_exists($candidate . '/vendor/autoload.php')) {
return $candidate;
}
}
return '';
}
$email = trim((string) getenv('FIREFLY_USER_EMAIL'));
$username = trim((string) getenv('FIREFLY_USER_USERNAME'));
$password = (string) getenv('FIREFLY_USER_PASSWORD');
if (($email === '' && $username === '') || $password === '') {
error_line('missing FIREFLY_USER_EMAIL or FIREFLY_USER_USERNAME or FIREFLY_USER_PASSWORD');
exit(2);
}
$root = find_app_root();
if ($root === '') {
error_line('firefly app root not found');
exit(2);
}
$autoload = $root . '/vendor/autoload.php';
$app_bootstrap = $root . '/bootstrap/app.php';
if (!file_exists($autoload) || !file_exists($app_bootstrap)) {
error_line('firefly bootstrap files missing');
exit(2);
}
require $autoload;
$app = require $app_bootstrap;
$kernel = $app->make(ConsoleKernel::class);
$kernel->bootstrap();
try {
FireflyConfig::set('single_user_mode', true);
} catch (Throwable $exc) {
error_line('failed to enforce single_user_mode: ' . $exc->getMessage());
}
if ($email !== '') {
$query = User::where('email', $email);
} else {
$query = User::where('username', $username);
}
if ($email !== '' && $username !== '') {
$query = $query->orWhere('username', $username);
}
$existing_user = $query->first();
if (!$existing_user) {
error_line('firefly user missing');
exit(3);
}
if (Hash::check($password, $existing_user->password)) {
log_line('password match');
exit(0);
}
log_line('password mismatch');
exit(1);
"""
).strip()

View File

@ -107,6 +107,8 @@ sleep infinity
class ImageSweeperService:
"""Create Kubernetes cleanup jobs that prune stale node images."""
def _job_payload(self, job_name: str) -> dict[str, Any]:
job: dict[str, Any] = {
"apiVersion": "batch/v1",

View File

@ -0,0 +1,418 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
import threading
from typing import Any
import httpx
from prometheus_client import Counter, Gauge
from ..settings import settings
from ..utils.logging import get_logger
logger = get_logger(__name__)
# --- Collector-level metrics: one counter plus last-run/success/failure
# timestamps for the weather collection itself.
JENKINS_BUILD_WEATHER_RUNS_TOTAL = Counter(
    "ariadne_jenkins_build_weather_runs_total",
    "Jenkins build weather collector runs by status",
    ["status"],
)
JENKINS_BUILD_WEATHER_LAST_RUN_TS = Gauge(
    "ariadne_jenkins_build_weather_last_run_timestamp_seconds",
    "Last Jenkins build weather collection timestamp",
)
JENKINS_BUILD_WEATHER_LAST_SUCCESS_TS = Gauge(
    "ariadne_jenkins_build_weather_last_success_timestamp_seconds",
    "Last successful Jenkins build weather collection timestamp",
)
JENKINS_BUILD_WEATHER_LAST_FAILURE_TS = Gauge(
    "ariadne_jenkins_build_weather_last_failure_timestamp_seconds",
    "Last failed Jenkins build weather collection timestamp",
)
# Aggregate job counts from the latest collection, labelled by status.
JENKINS_BUILD_WEATHER_JOBS_TOTAL = Gauge(
    "ariadne_jenkins_build_weather_jobs_total",
    "Jenkins jobs observed in the latest weather collection",
    ["status"],
)
# --- Per-job series, labelled (job, job_url, weather_icon). Stale label sets
# are cleaned up by _remove_missing_series.
JENKINS_BUILD_WEATHER_JOB_LAST_RUN_TS = Gauge(
    "ariadne_jenkins_build_weather_job_last_run_timestamp_seconds",
    "Jenkins job last run timestamp",
    ["job", "job_url", "weather_icon"],
)
JENKINS_BUILD_WEATHER_JOB_LAST_SUCCESS_TS = Gauge(
    "ariadne_jenkins_build_weather_job_last_success_timestamp_seconds",
    "Jenkins job last success timestamp",
    ["job", "job_url", "weather_icon"],
)
JENKINS_BUILD_WEATHER_JOB_LAST_FAILURE_TS = Gauge(
    "ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds",
    "Jenkins job last failure timestamp",
    ["job", "job_url", "weather_icon"],
)
JENKINS_BUILD_WEATHER_JOB_LAST_DURATION_SECONDS = Gauge(
    "ariadne_jenkins_build_weather_job_last_duration_seconds",
    "Jenkins job last build duration in seconds",
    ["job", "job_url", "weather_icon"],
)
JENKINS_BUILD_WEATHER_JOB_LAST_STATUS = Gauge(
    "ariadne_jenkins_build_weather_job_last_status",
    "Jenkins job last build status (1=success,0=failure,2=running,-1=unknown)",
    ["job", "job_url", "weather_icon"],
)
JENKINS_BUILD_WEATHER_JOB_HEALTH_SCORE = Gauge(
    "ariadne_jenkins_build_weather_job_health_score",
    "Jenkins job weather health score (0-100)",
    ["job", "job_url", "weather_icon"],
)
# Jenkins remote-API `tree` filter: two levels of jobs (folders + children)
# with name/url/color, health report, and build summaries.
_JENKINS_JOBS_TREE = (
    "jobs[name,url,color,healthReport[score],lastBuild[result,timestamp,duration],"
    "lastSuccessfulBuild[timestamp],lastFailedBuild[timestamp],"
    "jobs[name,url,color,healthReport[score],lastBuild[result,timestamp,duration],"
    "lastSuccessfulBuild[timestamp],lastFailedBuild[timestamp]]]"
)
# Numeric encoding used by the per-job status gauge.
_STATUS_VALUES = {
    "success": 1.0,
    "failure": 0.0,
    "running": 2.0,
    "unknown": -1.0,
}
# Label tuples currently exported; guarded by _JOB_SERIES_LOCK.
_JOB_SERIES: set[tuple[str, str, str]] = set()
_JOB_SERIES_LOCK = threading.Lock()
# All per-job metrics, for bulk removal of stale series.
_JOB_METRICS = (
    JENKINS_BUILD_WEATHER_JOB_LAST_RUN_TS,
    JENKINS_BUILD_WEATHER_JOB_LAST_SUCCESS_TS,
    JENKINS_BUILD_WEATHER_JOB_LAST_FAILURE_TS,
    JENKINS_BUILD_WEATHER_JOB_LAST_DURATION_SECONDS,
    JENKINS_BUILD_WEATHER_JOB_LAST_STATUS,
    JENKINS_BUILD_WEATHER_JOB_HEALTH_SCORE,
)
# Health-score thresholds delimiting the weather-icon bands.
_WEATHER_SUNNY_MIN_SCORE = 80
_WEATHER_PARTLY_CLOUDY_MIN_SCORE = 60
_WEATHER_CLOUDY_MIN_SCORE = 40
_WEATHER_RAINY_MIN_SCORE = 20
@dataclass(frozen=True)
class JenkinsBuildWeatherJob:
    """One Jenkins job's latest observed build 'weather'."""

    job: str  # flattened job name, e.g. "folder/child"
    job_url: str
    weather_icon: str  # emoji band derived from health_score ("" when unknown)
    status: str  # one of the _STATUS_VALUES keys
    last_run_ts: float  # unix seconds; 0.0 when unknown
    last_success_ts: float
    last_failure_ts: float
    last_duration_seconds: float
    health_score: float  # 0-100, or -1.0 when unknown

    @property
    def series_key(self) -> tuple[str, str, str]:
        """Label tuple identifying this job's Prometheus series."""
        return (self.job, self.job_url, self.weather_icon)
@dataclass(frozen=True)
class JenkinsBuildWeatherSummary:
    """Per-status job counts from one weather collection pass."""

    jobs_total: int
    success_total: int
    failure_total: int
    running_total: int
    unknown_total: int
def _metric_number(value: Any) -> float:
if isinstance(value, bool):
return 0.0
if isinstance(value, (int, float)):
return float(value)
return 0.0
def _millis_to_seconds(value: Any) -> float:
    """Convert a millisecond value to seconds; non-numeric or <=0 becomes 0.0."""
    # Numeric coercion inlined from _metric_number: bools and non-numbers -> 0.0.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0.0
    return value / 1000.0 if value > 0 else 0.0
def _jenkins_auth() -> tuple[str, str] | None:
    """Return (user, token) basic-auth credentials, or None when unconfigured."""
    user = settings.jenkins_api_user.strip()
    secret = settings.jenkins_api_token.strip()
    if not (user and secret):
        return None
    return (user, secret)
def _jenkins_status(job: dict[str, Any]) -> str:
last_build = job.get("lastBuild") if isinstance(job.get("lastBuild"), dict) else {}
result = str(last_build.get("result") or "").upper().strip()
color = str(job.get("color") or "").lower().strip()
if color.endswith("_anime"):
return "running"
if result == "SUCCESS":
return "success"
if result in {"FAILURE", "ABORTED", "UNSTABLE", "NOT_BUILT"}:
return "failure"
if color.startswith(("blue", "green")):
return "success"
if color.startswith(("red", "yellow")):
return "failure"
return "unknown"
def _health_score(job: dict[str, Any], status: str) -> float:
    """Return the job's health score clamped to [0, 100].

    Uses the first non-negative healthReport score; otherwise falls back to
    100/60/10 for success/running/failure and -1.0 when unknown.
    """
    reports = job.get("healthReport")
    if isinstance(reports, list):
        for report in reports:
            if not isinstance(report, dict):
                continue
            raw = report.get("score")
            # Numeric coercion inlined from _metric_number (bools/non-numbers -> 0.0).
            numeric = float(raw) if isinstance(raw, (int, float)) and not isinstance(raw, bool) else 0.0
            if numeric >= 0:
                return min(max(numeric, 0.0), 100.0)
    fallback = {"success": 100.0, "running": 60.0, "failure": 10.0}
    return fallback.get(status, -1.0)
def _weather_icon(score: float) -> str:
    """Map a 0-100 health score to a weather emoji ("" when score is unknown).

    Bands: >=80 sunny, >=60 partly cloudy, >=40 cloudy, >=20 rainy, else stormy.
    """
    if score < 0:
        # Unknown health: no icon.
        return ""
    if score >= _WEATHER_SUNNY_MIN_SCORE:
        return "☀️"
    if score >= _WEATHER_PARTLY_CLOUDY_MIN_SCORE:
        # Fix: this band previously returned "" (the partly-cloudy glyph was
        # missing), making it indistinguishable from the unknown sentinel.
        return "⛅"
    if score >= _WEATHER_CLOUDY_MIN_SCORE:
        return "☁️"
    if score >= _WEATHER_RAINY_MIN_SCORE:
        return "🌧️"
    return "⛈️"
def _flatten_jobs(items: list[Any], prefix: str = "") -> list[dict[str, Any]]:
flattened: list[dict[str, Any]] = []
for item in items:
if not isinstance(item, dict):
continue
name = item.get("name")
if not isinstance(name, str) or not name.strip():
continue
full_name = f"{prefix}/{name}" if prefix else name
nested_jobs = item.get("jobs") if isinstance(item.get("jobs"), list) else []
if nested_jobs:
flattened.extend(_flatten_jobs(nested_jobs, prefix=full_name))
last_build = item.get("lastBuild")
if nested_jobs and not isinstance(last_build, dict):
continue
payload = dict(item)
payload["name"] = full_name
flattened.append(payload)
return flattened
def _parse_job(raw: dict[str, Any]) -> JenkinsBuildWeatherJob | None:
    """Build a JenkinsBuildWeatherJob from one flattened API entry.

    Returns None when the entry lacks a usable name or URL.
    """
    name = str(raw.get("name") or "").strip()
    url = str(raw.get("url") or "").strip()
    if not (name and url):
        return None
    status = _jenkins_status(raw)
    score = _health_score(raw, status)

    def _section(key: str) -> dict[str, Any]:
        # Defensive: the API may omit these or return non-dict values.
        value = raw.get(key)
        return value if isinstance(value, dict) else {}

    build = _section("lastBuild")
    return JenkinsBuildWeatherJob(
        job=name,
        job_url=url,
        weather_icon=_weather_icon(score),
        status=status if status in _STATUS_VALUES else "unknown",
        last_run_ts=_millis_to_seconds(build.get("timestamp")),
        last_success_ts=_millis_to_seconds(_section("lastSuccessfulBuild").get("timestamp")),
        last_failure_ts=_millis_to_seconds(_section("lastFailedBuild").get("timestamp")),
        last_duration_seconds=_metric_number(build.get("duration")) / 1000.0,
        health_score=score,
    )
def _fetch_jobs() -> list[JenkinsBuildWeatherJob]:
    """Fetch and parse the Jenkins top-level job tree.

    Returns parsed jobs sorted most-recent-run first; an empty list when no
    Jenkins base URL is configured. Raises ValueError when the API answers
    with a non-object payload.
    """
    base_url = settings.jenkins_base_url.strip().rstrip("/")
    if not base_url:
        return []
    options: dict[str, Any] = {
        "timeout": settings.jenkins_api_timeout_sec,
        "follow_redirects": True,
    }
    credentials = _jenkins_auth()
    if credentials is not None:
        options["auth"] = credentials
    with httpx.Client(**options) as client:
        response = client.get(f"{base_url}/api/json", params={"tree": _JENKINS_JOBS_TREE})
        response.raise_for_status()
        payload = response.json()
    if not isinstance(payload, dict):
        raise ValueError("jenkins API returned a non-object payload")
    raw_jobs = payload.get("jobs")
    items = raw_jobs if isinstance(raw_jobs, list) else []
    parsed_jobs = [
        parsed
        for parsed in (_parse_job(raw) for raw in _flatten_jobs(items))
        if parsed is not None
    ]
    parsed_jobs.sort(key=lambda row: row.last_run_ts, reverse=True)
    return parsed_jobs
def _remove_missing_series(current_series: set[tuple[str, str, str]]) -> None:
    """Drop Prometheus child series for jobs that disappeared from Jenkins.

    Holds the series lock while reconciling the cached label set against the
    freshly observed one, then replaces the cache with a copy of the input.
    """
    global _JOB_SERIES
    with _JOB_SERIES_LOCK:
        for stale_labels in _JOB_SERIES - current_series:
            for metric in _JOB_METRICS:
                try:
                    metric.remove(*stale_labels)
                except KeyError:
                    # This metric never created the series; nothing to remove.
                    pass
        _JOB_SERIES = set(current_series)
def _record_jobs(jobs: list[JenkinsBuildWeatherJob]) -> JenkinsBuildWeatherSummary:
    """Publish per-job gauges plus aggregate status counts; return a summary."""
    status_counts: dict[str, int] = {
        "success": 0,
        "failure": 0,
        "running": 0,
        "unknown": 0,
    }
    seen_series: set[tuple[str, str, str]] = set()
    for job in jobs:
        seen_series.add(job.series_key)
        status_counts[job.status] = status_counts.get(job.status, 0) + 1
        # All per-job gauges share the same label set; build it once.
        labels = {
            "job": job.job,
            "job_url": job.job_url,
            "weather_icon": job.weather_icon,
        }
        JENKINS_BUILD_WEATHER_JOB_LAST_RUN_TS.labels(**labels).set(job.last_run_ts)
        JENKINS_BUILD_WEATHER_JOB_LAST_SUCCESS_TS.labels(**labels).set(job.last_success_ts)
        JENKINS_BUILD_WEATHER_JOB_LAST_FAILURE_TS.labels(**labels).set(job.last_failure_ts)
        JENKINS_BUILD_WEATHER_JOB_LAST_DURATION_SECONDS.labels(**labels).set(
            max(job.last_duration_seconds, 0.0)
        )
        JENKINS_BUILD_WEATHER_JOB_LAST_STATUS.labels(**labels).set(
            _STATUS_VALUES.get(job.status, _STATUS_VALUES["unknown"])
        )
        JENKINS_BUILD_WEATHER_JOB_HEALTH_SCORE.labels(**labels).set(job.health_score)
    # Retire series for jobs that no longer exist before refreshing totals.
    _remove_missing_series(seen_series)
    for status in ("success", "failure", "running", "unknown"):
        JENKINS_BUILD_WEATHER_JOBS_TOTAL.labels(status=status).set(status_counts.get(status, 0))
    return JenkinsBuildWeatherSummary(
        jobs_total=len(jobs),
        success_total=status_counts.get("success", 0),
        failure_total=status_counts.get("failure", 0),
        running_total=status_counts.get("running", 0),
        unknown_total=status_counts.get("unknown", 0),
    )
def collect_jenkins_build_weather() -> JenkinsBuildWeatherSummary:
    """Collect Jenkins homepage job weather/status into Prometheus gauges."""
    now_ts = datetime.now(timezone.utc).timestamp()
    # Stamp the run gauge up front so a crash still shows an attempted run.
    JENKINS_BUILD_WEATHER_LAST_RUN_TS.set(now_ts)
    if not settings.jenkins_base_url.strip():
        # No Jenkins configured: record a skip and return an all-zero summary.
        JENKINS_BUILD_WEATHER_RUNS_TOTAL.labels(status="skipped").inc()
        summary = JenkinsBuildWeatherSummary(
            jobs_total=0,
            success_total=0,
            failure_total=0,
            running_total=0,
            unknown_total=0,
        )
        logger.info(
            "jenkins build weather skipped",
            extra={
                "event": "jenkins_build_weather",
                "status": "skipped",
                "detail": "jenkins base url is empty",
            },
        )
        return summary
    try:
        jobs = _fetch_jobs()
        summary = _record_jobs(jobs)
    except Exception as exc:
        # Count the failure, stamp the failure gauge, then re-raise so the
        # caller/scheduler also observes the error.
        JENKINS_BUILD_WEATHER_RUNS_TOTAL.labels(status="error").inc()
        JENKINS_BUILD_WEATHER_LAST_FAILURE_TS.set(now_ts)
        logger.exception(
            "jenkins build weather collection failed",
            extra={
                "event": "jenkins_build_weather",
                "status": "error",
                "detail": str(exc),
            },
        )
        raise
    JENKINS_BUILD_WEATHER_RUNS_TOTAL.labels(status="ok").inc()
    JENKINS_BUILD_WEATHER_LAST_SUCCESS_TS.set(now_ts)
    logger.info(
        "jenkins build weather collection finished",
        extra={
            "event": "jenkins_build_weather",
            "status": "ok",
            "jobs_total": summary.jobs_total,
            "success_total": summary.success_total,
            "failure_total": summary.failure_total,
            "running_total": summary.running_total,
            "unknown_total": summary.unknown_total,
        },
    )
    return summary

View File

@ -0,0 +1,261 @@
from __future__ import annotations
from collections.abc import Callable
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any
@dataclass(frozen=True)
class _CleanupCandidate:
    """A single cleanup target: a PVC, PV, or Longhorn volume slated for deletion."""
    name: str  # object name as reported by the Kubernetes API
    kind: str  # "pvc", "pv", or "longhorn_volume"
    path: str  # API-server path used for the DELETE call
    created_at: datetime | None  # parsed creationTimestamp, when available
    related_pvc: str | None = None  # claimRef PVC name (set for PV candidates)
    pv_name: str | None = None  # PV name tied to a Longhorn volume candidate
@dataclass(frozen=True)
class _LonghornBinding:
    """PVC/PV linkage extracted from a Longhorn volume's labels and status.

    Fields are ``Any`` because values come straight from untyped API payloads
    and may be missing or non-string.
    """
    pvc_name: Any  # PVC the volume was created for, when discoverable
    pvc_namespace: Any  # namespace of that PVC, when discoverable
    referenced_pv_name: Any  # PV name from Longhorn's kubernetesStatus
def _parse_timestamp(raw: str) -> datetime | None:
"""Parse Kubernetes RFC3339 timestamps into timezone-aware datetimes."""
normalized = raw.replace("Z", "+00:00")
try:
return datetime.fromisoformat(normalized)
except ValueError:
return None
def _created_at(metadata: dict[str, Any]) -> datetime | None:
    """Return the object's creationTimestamp as a datetime, or None."""
    stamp = metadata.get("creationTimestamp")
    if isinstance(stamp, str) and stamp:
        return _parse_timestamp(stamp)
    return None
def _is_old_enough(settings_obj: Any, metadata: dict[str, Any]) -> bool:
    """Return true when an object age exceeds the configured cleanup threshold."""
    born = _created_at(metadata)
    if born is None:
        # Unparseable/missing creation time: never treat as old enough.
        return False
    threshold = timedelta(hours=settings_obj.jenkins_workspace_cleanup_min_age_hours)
    return datetime.now(timezone.utc) - born >= threshold
def _is_deleting(metadata: dict[str, Any]) -> bool:
deletion_ts = metadata.get("deletionTimestamp")
return isinstance(deletion_ts, str) and bool(deletion_ts.strip())
def _is_workspace_name(settings_obj: Any, name: Any) -> bool:
return isinstance(name, str) and name.startswith(settings_obj.jenkins_workspace_pvc_prefix)
def _active_workspace_claims(settings_obj: Any, get_json_func: Callable[[str], dict[str, Any]]) -> set[str]:
    """Collect currently referenced Jenkins workspace PVC names from pods."""
    ns = settings_obj.jenkins_workspace_namespace
    body = get_json_func(f"/api/v1/namespaces/{ns}/pods")
    raw_pods = body.get("items")
    pods = raw_pods if isinstance(raw_pods, list) else []
    referenced: set[str] = set()
    for pod in pods:
        if not isinstance(pod, dict):
            continue
        meta = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
        notes = meta.get("annotations") if isinstance(meta.get("annotations"), dict) else {}
        pod_spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
        vols = pod_spec.get("volumes") if isinstance(pod_spec.get("volumes"), list) else []
        # A PVC counts as active when any pod volume mounts it ...
        for vol in vols:
            if not isinstance(vol, dict):
                continue
            ref = vol.get("persistentVolumeClaim")
            if not isinstance(ref, dict):
                continue
            mounted = ref.get("claimName")
            if _is_workspace_name(settings_obj, mounted):
                referenced.add(mounted)
        # ... or when the pod advertises it via the workspace annotation.
        annotated = notes.get("jenkins.io/workspace-pvc")
        if _is_workspace_name(settings_obj, annotated):
            referenced.add(annotated)
    return referenced
def _workspace_pv_candidates(settings_obj: Any, get_json_func: Callable[[str], dict[str, Any]], active_claims: set[str]) -> tuple[list[_CleanupCandidate], set[str]]:
    """Find releasable Jenkins workspace PVs and keep a set of all PV names."""
    namespace = settings_obj.jenkins_workspace_namespace
    payload = get_json_func("/api/v1/persistentvolumes")
    items = payload.get("items") if isinstance(payload.get("items"), list) else []
    candidates: list[_CleanupCandidate] = []
    all_pv_names: set[str] = set()
    for pv in items:
        if not isinstance(pv, dict):
            continue
        metadata = pv.get("metadata") if isinstance(pv.get("metadata"), dict) else {}
        status = pv.get("status") if isinstance(pv.get("status"), dict) else {}
        spec = pv.get("spec") if isinstance(pv.get("spec"), dict) else {}
        name = metadata.get("name")
        # Every named PV is recorded (candidate or not) so the Longhorn orphan
        # scan can tell whether a volume's PV still exists.
        if isinstance(name, str) and name:
            all_pv_names.add(name)
        claim_ref = spec.get("claimRef") if isinstance(spec.get("claimRef"), dict) else {}
        claim_namespace = claim_ref.get("namespace")
        claim_name = claim_ref.get("name")
        phase = status.get("phase")
        # Guard chain: only workspace-prefixed PVs claimed from the configured
        # namespace, not already terminating, not mounted by a live pod, in a
        # terminal phase, and older than the minimum age become candidates.
        if claim_namespace != namespace:
            continue
        if not _is_workspace_name(settings_obj, claim_name):
            continue
        if _is_deleting(metadata):
            continue
        if claim_name in active_claims:
            continue
        if phase not in {"Released", "Failed"}:
            continue
        if not _is_old_enough(settings_obj, metadata):
            continue
        if not isinstance(name, str) or not name:
            continue
        candidates.append(
            _CleanupCandidate(
                name=name,
                kind="pv",
                path=f"/api/v1/persistentvolumes/{name}",
                created_at=_created_at(metadata),
                related_pvc=claim_name if isinstance(claim_name, str) else None,
            )
        )
    return candidates, all_pv_names
def _workspace_pvc_candidates(settings_obj: Any, get_json_func: Callable[[str], dict[str, Any]], active_claims: set[str]) -> list[_CleanupCandidate]:
    """Find stale Jenkins workspace PVCs that are not actively referenced."""
    namespace = settings_obj.jenkins_workspace_namespace
    payload = get_json_func(f"/api/v1/namespaces/{namespace}/persistentvolumeclaims")
    items = payload.get("items") if isinstance(payload.get("items"), list) else []
    candidates: list[_CleanupCandidate] = []
    for pvc in items:
        if not isinstance(pvc, dict):
            continue
        metadata = pvc.get("metadata") if isinstance(pvc.get("metadata"), dict) else {}
        status = pvc.get("status") if isinstance(pvc.get("status"), dict) else {}
        claim_name = metadata.get("name")
        phase = status.get("phase")
        # Guard chain: only workspace-prefixed, non-terminating, unreferenced,
        # non-Bound PVCs past the minimum age become candidates.
        if not _is_workspace_name(settings_obj, claim_name):
            continue
        if _is_deleting(metadata):
            continue
        if claim_name in active_claims:
            continue
        if phase == "Bound":
            continue
        if not _is_old_enough(settings_obj, metadata):
            continue
        if not isinstance(claim_name, str) or not claim_name:
            continue
        candidates.append(
            _CleanupCandidate(
                name=claim_name,
                kind="pvc",
                path=f"/api/v1/namespaces/{namespace}/persistentvolumeclaims/{claim_name}",
                created_at=_created_at(metadata),
            )
        )
    return candidates
def _workspace_binding_from_longhorn(metadata: dict[str, Any], status: dict[str, Any]) -> _LonghornBinding:
    """Extract PVC/PV linkage from a Longhorn volume's labels and status.

    Kubernetes "created-for" labels take precedence; Longhorn's
    kubernetesStatus block is the fallback for PVC name and namespace.
    """
    raw_labels = metadata.get("labels")
    labels = raw_labels if isinstance(raw_labels, dict) else {}
    raw_k8s = status.get("kubernetesStatus")
    k8s_status = raw_k8s if isinstance(raw_k8s, dict) else {}
    claim = labels.get("kubernetes.io/created-for/pvc/name")
    if not (isinstance(claim, str) and claim):
        claim = k8s_status.get("pvcName")
    claim_ns = labels.get("kubernetes.io/created-for/pvc/namespace")
    if not (isinstance(claim_ns, str) and claim_ns):
        claim_ns = k8s_status.get("namespace")
    return _LonghornBinding(
        pvc_name=claim,
        pvc_namespace=claim_ns,
        referenced_pv_name=k8s_status.get("pvName"),
    )
def _should_delete_longhorn_volume(settings_obj: Any, name: str, binding: _LonghornBinding, all_pv_names: set[str], removed_pv_names: set[str]) -> bool:
    """Decide whether a Longhorn volume is an orphan that should be removed.

    Volumes whose PV was deleted during this run are always removable.
    Otherwise the volume must look like a Jenkins workspace volume, reference
    no PV that still exists, and belong to the workspace namespace (or to no
    namespace at all).
    """
    if name in removed_pv_names:
        return True
    if binding.referenced_pv_name in removed_pv_names:
        return True
    if not _is_workspace_name(settings_obj, binding.pvc_name):
        return False
    pv_still_exists = name in all_pv_names or (
        isinstance(binding.referenced_pv_name, str)
        and binding.referenced_pv_name in all_pv_names
    )
    if pv_still_exists:
        return False
    if binding.pvc_namespace in {None, ""}:
        return True
    return binding.pvc_namespace == settings_obj.jenkins_workspace_namespace
def _workspace_longhorn_candidates(settings_obj: Any, get_json_func: Callable[[str], dict[str, Any]], all_pv_names: set[str], removed_pv_names: set[str]) -> list[_CleanupCandidate]:
    """Find orphan Longhorn volumes left behind by deleted workspace PVs/PVCs."""
    namespace = "longhorn-system"
    payload = get_json_func("/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes")
    items = payload.get("items") if isinstance(payload.get("items"), list) else []
    candidates: list[_CleanupCandidate] = []
    for volume in items:
        if not isinstance(volume, dict):
            continue
        metadata = volume.get("metadata") if isinstance(volume.get("metadata"), dict) else {}
        status = volume.get("status") if isinstance(volume.get("status"), dict) else {}
        spec = volume.get("spec") if isinstance(volume.get("spec"), dict) else {}
        name = metadata.get("name")
        if not isinstance(name, str) or not name:
            continue
        binding = _workspace_binding_from_longhorn(metadata, status)
        robust_state = status.get("robustness")
        state = status.get("state")
        attached = status.get("isAttached")
        frontend = spec.get("frontend")
        if not _should_delete_longhorn_volume(
            settings_obj,
            name,
            binding,
            all_pv_names,
            removed_pv_names,
        ):
            continue
        if _is_deleting(metadata):
            continue
        if not _is_old_enough(settings_obj, metadata):
            continue
        # Safety guards: only detached volumes in an expected state/robustness/
        # frontend combination are deleted; anything attached or surprising is
        # left alone.
        if state not in {None, "detached", "faulted", "unknown"}:
            continue
        if attached is True:
            continue
        if robust_state not in {None, "unknown", "faulted", "degraded"}:
            continue
        if frontend not in {None, "", "blockdev"}:
            continue
        candidates.append(
            _CleanupCandidate(
                name=name,
                kind="longhorn_volume",
                path=f"/apis/longhorn.io/v1beta2/namespaces/{namespace}/volumes/{name}",
                created_at=_created_at(metadata),
                pv_name=name,
            )
        )
    return candidates

View File

@ -0,0 +1,359 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from prometheus_client import Counter, Gauge
from ..k8s.client import delete_json, get_json
from ..settings import settings
from ..utils.logging import get_logger
from .jenkins_workspace_candidates import (
_CleanupCandidate,
_active_workspace_claims,
_workspace_longhorn_candidates,
_workspace_pv_candidates,
_workspace_pvc_candidates,
)
logger = get_logger(__name__)
# Run-level counter: one increment per cleanup pass, labelled by outcome/mode.
JENKINS_WORKSPACE_CLEANUP_RUNS_TOTAL = Counter(
    "ariadne_jenkins_workspace_cleanup_runs_total",
    "Jenkins workspace cleanup runs by status and mode",
    ["status", "mode"],
)
# Per-object counter: planned/deleted/skipped/failed objects by kind and mode.
JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL = Counter(
    "ariadne_jenkins_workspace_cleanup_objects_total",
    "Jenkins workspace cleanup objects by kind, action, and mode",
    ["kind", "action", "mode"],
)
# Timestamps of the latest run / success / failure, useful for staleness alerts.
JENKINS_WORKSPACE_CLEANUP_LAST_RUN_TS = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds",
    "Last Jenkins workspace cleanup run timestamp",
)
JENKINS_WORKSPACE_CLEANUP_LAST_SUCCESS_TS = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_success_timestamp_seconds",
    "Last successful Jenkins workspace cleanup timestamp",
)
JENKINS_WORKSPACE_CLEANUP_LAST_FAILURE_TS = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_failure_timestamp_seconds",
    "Last failed Jenkins workspace cleanup timestamp",
)
# Snapshot gauges: counts describing only the most recent pass.
JENKINS_WORKSPACE_CLEANUP_LAST_DELETED = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_deleted_total",
    "Last Jenkins workspace cleanup deleted object count",
    ["kind"],
)
JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_planned_total",
    "Last Jenkins workspace cleanup planned object count",
    ["kind"],
)
JENKINS_WORKSPACE_CLEANUP_LAST_SKIPPED = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_skipped_total",
    "Last Jenkins workspace cleanup skipped object count",
)
JENKINS_WORKSPACE_CLEANUP_LAST_FAILURES = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_failures_total",
    "Last Jenkins workspace cleanup failure count",
)
@dataclass(frozen=True)
class JenkinsWorkspaceCleanupSummary:
    """Summarize one Jenkins workspace-storage cleanup pass.
    Inputs: Kubernetes PV/PVC/Longhorn objects fetched from the API server.
    Outputs: deterministic counters for operator logs and metrics.
    """
    pvs_planned: int  # stale PVs discovered this pass
    pvcs_planned: int  # stale PVCs discovered this pass
    volumes_planned: int  # orphan Longhorn volumes discovered this pass
    pvs_deleted: int  # PVs actually deleted
    pvcs_deleted: int  # PVCs actually deleted
    volumes_deleted: int  # Longhorn volumes actually deleted
    skipped: int  # candidates skipped (empty name or deletion budget spent)
    failures: int  # delete calls that raised
    dry_run: bool  # True when no delete calls were issued
    @property
    def planned(self) -> int:
        """Total objects the pass would delete across all kinds."""
        return self.pvs_planned + self.pvcs_planned + self.volumes_planned
    @property
    def deleted(self) -> int:
        """Total objects actually deleted across all kinds."""
        return self.pvs_deleted + self.pvcs_deleted + self.volumes_deleted
def _validate_cleanup_settings() -> tuple[str, str, bool, int]:
    """Read and sanity-check the cleanup configuration.

    Returns (namespace, pvc prefix, dry-run flag, max deletions per run).
    Raises ValueError when any setting is empty or out of range.
    """
    ns = settings.jenkins_workspace_namespace
    pvc_prefix = settings.jenkins_workspace_pvc_prefix.strip()
    dry_run_flag = settings.jenkins_workspace_cleanup_dry_run
    deletion_cap = settings.jenkins_workspace_cleanup_max_deletions_per_run
    if not ns.strip():
        raise ValueError("jenkins workspace cleanup namespace is empty")
    if not pvc_prefix:
        raise ValueError("jenkins workspace cleanup pvc prefix is empty")
    if settings.jenkins_workspace_cleanup_min_age_hours < 1.0:
        raise ValueError("jenkins workspace cleanup min age must be >= 1 hour")
    if deletion_cap < 1:
        raise ValueError("jenkins workspace cleanup max deletions must be >= 1")
    return ns, pvc_prefix, dry_run_flag, deletion_cap
def _planned_removed_pv_names_dry_run(stale_pvcs: list[_CleanupCandidate], stale_pvs: list[_CleanupCandidate], max_deletions: int) -> set[str]:
remaining = max(max_deletions - len(stale_pvcs), 0)
if remaining == 0:
return set()
names = [candidate.name for candidate in stale_pvs if candidate.name]
return set(names[:remaining])
def _delete_candidates(candidates: list[_CleanupCandidate], *, deletion_budget: int | None, failure_log: str, failure_field: str, removed_pv_names: set[str] | None = None) -> tuple[int, int, int, int | None]:
    """Delete candidates via the Kubernetes API within an optional budget.

    Returns ``(deleted, skipped, failures, remaining_budget)``. A candidate
    with an empty name, or one reached after the budget is exhausted, counts
    as skipped. Note: the budget is decremented *before* the delete attempt,
    so a failed delete still consumes budget.
    """
    deleted = 0
    skipped = 0
    failures = 0
    budget = deletion_budget
    for candidate in candidates:
        if not candidate.name:
            skipped += 1
            continue
        if budget is not None and budget <= 0:
            skipped += 1
            continue
        if budget is not None:
            budget -= 1
        try:
            delete_json(candidate.path)
            deleted += 1
            if removed_pv_names is not None:
                # Track deleted PV names so Longhorn orphan detection sees them.
                removed_pv_names.add(candidate.name)
        except Exception as exc:
            # Best-effort: log the failure and keep processing the rest.
            failures += 1
            logger.info(
                failure_log,
                extra={"event": "jenkins_workspace_cleanup", failure_field: candidate.name, "detail": str(exc)},
            )
    return deleted, skipped, failures, budget
def _record_guard_cap(*, max_deletions: int, stale_pvcs: list[_CleanupCandidate], stale_pvs: list[_CleanupCandidate], stale_volumes: list[_CleanupCandidate], dry_run: bool) -> None:
    """Warn when more deletions are planned than the per-run guard allows."""
    total_planned = len(stale_pvcs) + len(stale_pvs) + len(stale_volumes)
    if total_planned <= max_deletions:
        return
    log_context = {
        "event": "jenkins_workspace_cleanup",
        "status": "guard_capped",
        "namespace": settings.jenkins_workspace_namespace,
        "dry_run": dry_run,
        "planned_total": total_planned,
        "max_deletions": max_deletions,
        "planned_pvs": len(stale_pvs),
        "planned_pvcs": len(stale_pvcs),
        "planned_volumes": len(stale_volumes),
    }
    logger.warning(
        "jenkins workspace cleanup capped by max deletions guard",
        extra=log_context,
    )
def _dry_run_summary(*, namespace: str, max_deletions: int, stale_pvcs: list[_CleanupCandidate], stale_pvs: list[_CleanupCandidate], all_pv_names: set[str]) -> JenkinsWorkspaceCleanupSummary:
    """Report what a real cleanup run would delete, without issuing deletes."""
    # Simulate which PV names the deletion budget would allow so the Longhorn
    # orphan scan sees the same picture a real run would.
    simulated_removed = _planned_removed_pv_names_dry_run(stale_pvcs, stale_pvs, max_deletions)
    stale_volumes = _workspace_longhorn_candidates(settings, get_json, all_pv_names, simulated_removed)
    _record_guard_cap(
        max_deletions=max_deletions,
        stale_pvcs=stale_pvcs,
        stale_pvs=stale_pvs,
        stale_volumes=stale_volumes,
        dry_run=True,
    )
    logger.info(
        "jenkins workspace cleanup dry-run enabled",
        extra={
            "event": "jenkins_workspace_cleanup",
            "status": "dry_run",
            "namespace": namespace,
            "dry_run": True,
            "planned_pvs": len(stale_pvs),
            "planned_pvcs": len(stale_pvcs),
            "planned_volumes": len(stale_volumes),
            "max_deletions": max_deletions,
        },
    )
    return JenkinsWorkspaceCleanupSummary(
        pvs_planned=len(stale_pvs),
        pvcs_planned=len(stale_pvcs),
        volumes_planned=len(stale_volumes),
        pvs_deleted=0,
        pvcs_deleted=0,
        volumes_deleted=0,
        skipped=0,
        failures=0,
        dry_run=True,
    )
def _delete_run_summary(*, namespace: str, max_deletions: int, stale_pvcs: list[_CleanupCandidate], stale_pvs: list[_CleanupCandidate], all_pv_names: set[str]) -> JenkinsWorkspaceCleanupSummary:
    """Delete stale PVCs, then PVs, then orphan Longhorn volumes.

    One deletion budget flows through all three phases, consumed by PVCs
    first. ``namespace`` is accepted for signature parity with the dry-run
    path but is not used here.
    """
    removed_pv_names: set[str] = set()
    deletion_budget: int | None = max_deletions
    pvcs_deleted, pvc_skipped, pvc_failures, deletion_budget = _delete_candidates(
        stale_pvcs,
        deletion_budget=deletion_budget,
        failure_log="jenkins workspace pvc delete failed",
        failure_field="claim",
    )
    pvs_deleted, pv_skipped, pv_failures, deletion_budget = _delete_candidates(
        stale_pvs,
        deletion_budget=deletion_budget,
        failure_log="jenkins workspace pv delete failed",
        failure_field="pv",
        removed_pv_names=removed_pv_names,
    )
    # Longhorn orphans are only discoverable after the PV deletions above have
    # populated removed_pv_names.
    stale_volumes = _workspace_longhorn_candidates(settings, get_json, all_pv_names, removed_pv_names)
    _record_guard_cap(
        max_deletions=max_deletions,
        stale_pvcs=stale_pvcs,
        stale_pvs=stale_pvs,
        stale_volumes=stale_volumes,
        dry_run=False,
    )
    volumes_deleted, volume_skipped, volume_failures, _ = _delete_candidates(
        stale_volumes,
        deletion_budget=deletion_budget,
        failure_log="jenkins workspace longhorn volume delete failed",
        failure_field="volume",
    )
    return JenkinsWorkspaceCleanupSummary(
        pvs_planned=len(stale_pvs),
        pvcs_planned=len(stale_pvcs),
        volumes_planned=len(stale_volumes),
        pvs_deleted=pvs_deleted,
        pvcs_deleted=pvcs_deleted,
        volumes_deleted=volumes_deleted,
        skipped=pvc_skipped + pv_skipped + volume_skipped,
        failures=pvc_failures + pv_failures + volume_failures,
        dry_run=False,
    )
def _record_metrics(summary: JenkinsWorkspaceCleanupSummary) -> None:
    """Publish one cleanup pass into the Prometheus counters and gauges."""
    mode = "dry_run" if summary.dry_run else "delete"
    status = "ok" if summary.failures == 0 else "error"
    JENKINS_WORKSPACE_CLEANUP_RUNS_TOTAL.labels(status=status, mode=mode).inc()
    if summary.failures:
        JENKINS_WORKSPACE_CLEANUP_LAST_FAILURE_TS.set(datetime.now(timezone.utc).timestamp())
    else:
        JENKINS_WORKSPACE_CLEANUP_LAST_SUCCESS_TS.set(datetime.now(timezone.utc).timestamp())
    JENKINS_WORKSPACE_CLEANUP_LAST_RUN_TS.set(datetime.now(timezone.utc).timestamp())
    # Snapshot gauges describe only this pass; they are overwritten each run.
    JENKINS_WORKSPACE_CLEANUP_LAST_DELETED.labels(kind="pvc").set(summary.pvcs_deleted)
    JENKINS_WORKSPACE_CLEANUP_LAST_DELETED.labels(kind="pv").set(summary.pvs_deleted)
    JENKINS_WORKSPACE_CLEANUP_LAST_DELETED.labels(kind="longhorn_volume").set(summary.volumes_deleted)
    JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED.labels(kind="pvc").set(summary.pvcs_planned)
    JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED.labels(kind="pv").set(summary.pvs_planned)
    JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED.labels(kind="longhorn_volume").set(summary.volumes_planned)
    JENKINS_WORKSPACE_CLEANUP_LAST_SKIPPED.set(summary.skipped)
    JENKINS_WORKSPACE_CLEANUP_LAST_FAILURES.set(summary.failures)
    # Cumulative counters only move when there was something to report.
    for kind, planned, deleted in (
        ("pvc", summary.pvcs_planned, summary.pvcs_deleted),
        ("pv", summary.pvs_planned, summary.pvs_deleted),
        ("longhorn_volume", summary.volumes_planned, summary.volumes_deleted),
    ):
        if planned:
            JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(kind=kind, action="planned", mode=mode).inc(planned)
        if deleted:
            JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(kind=kind, action="deleted", mode=mode).inc(deleted)
    if summary.skipped:
        JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(
            kind="cleanup",
            action="skipped",
            mode=mode,
        ).inc(summary.skipped)
    if summary.failures:
        JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(
            kind="cleanup",
            action="failed",
            mode=mode,
        ).inc(summary.failures)
def cleanup_jenkins_workspace_storage() -> JenkinsWorkspaceCleanupSummary:
    """Delete stale Jenkins workspace PVC/PV artifacts and orphan Longhorn volumes."""
    # Start from an all-zero summary so the exception path below can still
    # report counts even when the failure happens before any phase completes.
    summary = JenkinsWorkspaceCleanupSummary(
        pvs_planned=0,
        pvcs_planned=0,
        volumes_planned=0,
        pvs_deleted=0,
        pvcs_deleted=0,
        volumes_deleted=0,
        skipped=0,
        failures=0,
        dry_run=settings.jenkins_workspace_cleanup_dry_run,
    )
    try:
        namespace, _prefix, dry_run, max_deletions = _validate_cleanup_settings()
        active_claims = _active_workspace_claims(settings, get_json)
        stale_pvs, all_pv_names = _workspace_pv_candidates(settings, get_json, active_claims)
        stale_pvcs = _workspace_pvc_candidates(settings, get_json, active_claims)
        if dry_run:
            summary = _dry_run_summary(
                namespace=namespace,
                max_deletions=max_deletions,
                stale_pvcs=stale_pvcs,
                stale_pvs=stale_pvs,
                all_pv_names=all_pv_names,
            )
        else:
            summary = _delete_run_summary(
                namespace=namespace,
                max_deletions=max_deletions,
                stale_pvcs=stale_pvcs,
                stale_pvs=stale_pvs,
                all_pv_names=all_pv_names,
            )
    except Exception as exc:
        logger.exception(
            "jenkins workspace cleanup failed",
            extra={
                "event": "jenkins_workspace_cleanup",
                "status": "error",
                "namespace": settings.jenkins_workspace_namespace,
                "detail": str(exc),
            },
        )
        # Rebuild the (frozen) summary with one extra failure, record metrics
        # for the failed pass, then re-raise so the scheduler sees the error.
        summary = JenkinsWorkspaceCleanupSummary(
            pvs_planned=summary.pvs_planned,
            pvcs_planned=summary.pvcs_planned,
            volumes_planned=summary.volumes_planned,
            pvs_deleted=summary.pvs_deleted,
            pvcs_deleted=summary.pvcs_deleted,
            volumes_deleted=summary.volumes_deleted,
            skipped=summary.skipped,
            failures=summary.failures + 1,
            dry_run=summary.dry_run,
        )
        _record_metrics(summary)
        raise
    _record_metrics(summary)
    logger.info(
        "jenkins workspace cleanup finished",
        extra={
            "event": "jenkins_workspace_cleanup",
            "status": "ok" if summary.failures == 0 else "error",
            "dry_run": summary.dry_run,
            "namespace": namespace,
            "planned_pvs": summary.pvs_planned,
            "planned_pvcs": summary.pvcs_planned,
            "planned_volumes": summary.volumes_planned,
            "deleted_pvs": summary.pvs_deleted,
            "deleted_pvcs": summary.pvcs_deleted,
            "deleted_volumes": summary.volumes_deleted,
            "skipped": summary.skipped,
            "failures": summary.failures,
        },
    )
    return summary

View File

@ -9,6 +9,8 @@ from ..settings import settings
class KeycloakAdminClient:
"""Call the Keycloak admin API for user, group, and attribute updates."""
def __init__(self) -> None:
self._token: str = ""
self._expires_at: float = 0.0

View File

@ -29,6 +29,8 @@ def _profile_complete(user: dict[str, Any]) -> bool:
def run_profile_sync() -> ProfileSyncSummary:
"""Clear completed Keycloak profile actions once required fields exist."""
if not keycloak_admin.ready():
summary = ProfileSyncSummary(0, 0, 0, 1, detail="keycloak admin not configured")
logger.info(

View File

@ -19,6 +19,8 @@ class SentEmail:
class Mailer:
"""Send onboarding and notification email through configured SMTP."""
def __init__(self) -> None:
self._host = settings.smtp_host
self._port = settings.smtp_port

View File

@ -115,6 +115,8 @@ def _password_too_long(password: str) -> bool:
class MailuService:
"""Synchronize Keycloak user mail settings into Mailu storage."""
def __init__(self) -> None:
self._db_config = {
"host": settings.mailu_db_host,
@ -136,11 +138,7 @@ class MailuService:
)
@staticmethod
def resolve_mailu_email(
username: str,
attributes: dict[str, Any] | None,
fallback_email: str = "",
) -> str:
def resolve_mailu_email(username: str, attributes: dict[str, Any] | None, fallback_email: str = "") -> str:
attrs = attributes or {}
explicit = _extract_attr(attrs, MAILU_EMAIL_ATTR)
if explicit:
@ -180,12 +178,7 @@ class MailuService:
},
)
def _prepare_updates(
self,
username: str,
attrs: dict[str, Any],
mailu_email: str,
) -> tuple[bool, dict[str, list[str]], str]:
def _prepare_updates(self, username: str, attrs: dict[str, Any], mailu_email: str) -> tuple[bool, dict[str, list[str]], str]:
updates: dict[str, list[str]] = {}
if not _extract_attr(attrs, MAILU_EMAIL_ATTR):
updates[MAILU_EMAIL_ATTR] = [mailu_email]
@ -226,10 +219,7 @@ class MailuService:
return True
return self._is_service_account(user, username)
def _build_sync_context(
self,
user: dict[str, Any],
) -> tuple[MailuSyncContext | None, MailuUserSyncResult | None]:
def _build_sync_context(self, user: dict[str, Any]) -> tuple[MailuSyncContext | None, MailuUserSyncResult | None]:
username = self._username(user)
if self._should_skip_user(user, username):
return None, MailuUserSyncResult(skipped=1)
@ -268,11 +258,7 @@ class MailuService:
None,
)
def _ensure_mailbox_with_retry(
self,
conn: psycopg.Connection,
ctx: MailuSyncContext,
) -> tuple[bool, bool, bool]:
def _ensure_mailbox_with_retry(self, conn: psycopg.Connection, ctx: MailuSyncContext) -> tuple[bool, bool, bool]:
mailbox_ok = False
rotated = False
failed = False
@ -303,12 +289,7 @@ class MailuService:
return mailbox_ok, failed, rotated
@staticmethod
def _build_sync_result(
updated: int,
mailbox_ok: bool,
failed: bool,
rotated: bool,
) -> MailuUserSyncResult:
def _build_sync_result(updated: int, mailbox_ok: bool, failed: bool, rotated: bool) -> MailuUserSyncResult:
if failed:
return MailuUserSyncResult(failures=1, updated=updated)
if mailbox_ok:
@ -324,13 +305,7 @@ class MailuService:
mailbox_ok, failed, rotated = self._ensure_mailbox_with_retry(conn, ctx)
return self._build_sync_result(ctx.updated, mailbox_ok, failed, rotated)
def _ensure_mailbox(
self,
conn: psycopg.Connection,
email: str,
password: str,
display_name: str,
) -> bool:
def _ensure_mailbox(self, conn: psycopg.Connection, email: str, password: str, display_name: str) -> bool:
email = (email or "").strip()
if not email or "@" not in email:
return False

View File

@ -54,13 +54,9 @@ def _event_context(payload: dict[str, Any] | None) -> dict[str, Any]:
class MailuEventRunner:
def __init__(
self,
min_interval_sec: float,
wait_timeout_sec: float,
runner: Callable[[str, bool], tuple[str, str]] | None = None,
thread_factory: Callable[..., threading.Thread] = threading.Thread,
) -> None:
"""Debounce Keycloak events into Mailu synchronization runs."""
def __init__(self, min_interval_sec: float, wait_timeout_sec: float, runner: Callable[[str, bool], tuple[str, str]] | None = None, thread_factory: Callable[..., threading.Thread] = threading.Thread) -> None:
self._min_interval_sec = min_interval_sec
self._wait_timeout_sec = wait_timeout_sec
self._runner = runner or self._default_runner

View File

@ -39,6 +39,8 @@ def _normalize_payload(payload: Any) -> dict[str, Any]:
class MetisService:
"""Trigger Metis sentinel watch runs and normalize their response."""
def ready(self) -> bool:
return bool(_watch_url())

View File

@ -1,8 +1,6 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
import re
import time
from typing import Any
@ -15,97 +13,20 @@ from ..settings import settings
from ..utils.logging import get_logger
from ..utils.passwords import random_password
from .keycloak_admin import keycloak_admin
from .nextcloud_maintenance import run_maintenance as run_nextcloud_maintenance
from .nextcloud_mail_models import MailSyncCounters
from .nextcloud_mail_models import display_name as _display_name
from .nextcloud_mail_models import _extract_attr
from .nextcloud_mail_models import _parse_mail_export
from .nextcloud_mail_models import _resolve_mailu_email
logger = get_logger(__name__)
def _extract_attr(attrs: Any, key: str) -> str:
if not isinstance(attrs, dict):
return ""
raw = attrs.get(key)
if isinstance(raw, list):
for item in raw:
if isinstance(item, str) and item.strip():
return item.strip()
return ""
if isinstance(raw, str) and raw.strip():
return raw.strip()
return ""
def _resolve_mailu_email(username: str, user: dict[str, Any]) -> str:
attrs = user.get("attributes")
mailu_email = _extract_attr(attrs, "mailu_email")
if mailu_email:
return mailu_email
email = user.get("email")
if isinstance(email, str) and email.strip():
email = email.strip()
if email.lower().endswith(f"@{settings.mailu_domain.lower()}"):
return email
return f"{username}@{settings.mailu_domain}"
def _parse_mail_export(output: str) -> list[tuple[str, str]]:
accounts: list[tuple[str, str]] = []
account_id = ""
for line in output.splitlines():
line = line.strip()
if not line:
continue
match = re.match(r"^Account\s+(\d+):", line, flags=re.IGNORECASE)
if match:
account_id = match.group(1)
continue
match = re.match(r"^-\s*E-?mail:\s*(\S+)", line, flags=re.IGNORECASE)
if match and account_id:
accounts.append((account_id, match.group(1)))
return accounts
@dataclass(frozen=True)
class NextcloudMailSyncSummary:
processed: int
created: int
updated: int
deleted: int
skipped: int
failures: int
detail: str = ""
@dataclass
class MailSyncCounters:
processed: int = 0
created: int = 0
updated: int = 0
deleted: int = 0
skipped: int = 0
failures: int = 0
last_error: str = ""
def summary(self) -> NextcloudMailSyncSummary:
return NextcloudMailSyncSummary(
processed=self.processed,
created=self.created,
updated=self.updated,
deleted=self.deleted,
skipped=self.skipped,
failures=self.failures,
detail=self.last_error,
)
def status(self) -> str:
return "ok" if self.failures == 0 else "error"
def record_failure(self, detail: str) -> None:
self.failures += 1
if detail and not self.last_error:
self.last_error = detail
class NextcloudService:
"""Synchronize user mail configuration inside the Nextcloud pod."""
def __init__(self) -> None:
self._executor = PodExecutor(
settings.nextcloud_namespace,
@ -113,13 +34,7 @@ class NextcloudService:
settings.nextcloud_container,
)
def _exec_with_fallback(
self,
primary: list[str],
fallback: list[str],
env: dict[str, str] | None = None,
check: bool = True,
) -> ExecResult:
def _exec_with_fallback(self, primary: list[str], fallback: list[str], env: dict[str, str] | None = None, check: bool = True) -> ExecResult:
try:
result = self._executor.exec(
primary,
@ -146,12 +61,7 @@ class NextcloudService:
)
return result
def _occ_exec(
self,
args: list[str],
env: dict[str, str] | None = None,
check: bool = True,
) -> ExecResult:
def _occ_exec(self, args: list[str], env: dict[str, str] | None = None, check: bool = True) -> ExecResult:
command = ["runuser", "-u", "www-data", "--", "php", "/var/www/html/occ", *args]
fallback = ["php", "/var/www/html/occ", *args]
return self._exec_with_fallback(command, fallback, env=env, check=check)
@ -160,21 +70,7 @@ class NextcloudService:
result = self._occ_exec(args, check=True)
return result.stdout
def _display_name(self, user: dict[str, Any]) -> str:
first = user.get("firstName") if isinstance(user.get("firstName"), str) else ""
last = user.get("lastName") if isinstance(user.get("lastName"), str) else ""
first = first.strip()
last = last.strip()
if first and last:
return f"{first} {last}"
return last or first
def _ensure_nextcloud_user(
self,
username: str,
mailu_email: str,
display_name: str,
) -> None:
def _ensure_nextcloud_user(self, username: str, mailu_email: str, display_name: str) -> None:
result = self._occ_exec(["user:info", username], check=False)
if result.ok:
return
@ -279,11 +175,7 @@ class NextcloudService:
full_user = user
return username_val, user_id, full_user
def _list_mail_accounts_safe(
self,
username: str,
counters: MailSyncCounters,
) -> list[tuple[str, str]] | None:
def _list_mail_accounts_safe(self, username: str, counters: MailSyncCounters) -> list[tuple[str, str]] | None:
try:
return self._list_mail_accounts(username)
except Exception as exc:
@ -295,11 +187,7 @@ class NextcloudService:
)
return None
def _select_primary_account(
self,
mailu_accounts: list[tuple[str, str]],
mailu_email: str,
) -> tuple[str, str]:
def _select_primary_account(self, mailu_accounts: list[tuple[str, str]], mailu_email: str) -> tuple[str, str]:
primary_id = ""
primary_email = ""
for account_id, account_email in mailu_accounts:
@ -312,13 +200,7 @@ class NextcloudService:
break
return primary_id, primary_email
def _update_mail_account(
self,
username: str,
primary_id: str,
mailu_email: str,
app_pw: str,
) -> str | None:
def _update_mail_account(self, username: str, primary_id: str, mailu_email: str, app_pw: str) -> str | None:
try:
self._occ(
[
@ -383,12 +265,7 @@ class NextcloudService:
except Exception as exc:
return str(exc)
def _delete_extra_accounts(
self,
mailu_accounts: list[tuple[str, str]],
primary_id: str,
counters: MailSyncCounters,
) -> int:
def _delete_extra_accounts(self, mailu_accounts: list[tuple[str, str]], primary_id: str, counters: MailSyncCounters) -> int:
deleted = 0
for account_id, _account_email in mailu_accounts:
if account_id == primary_id:
@ -407,11 +284,7 @@ class NextcloudService:
if email.lower().endswith(f"@{settings.mailu_domain.lower()}")
]
def _summarize_mail_accounts(
self,
accounts: list[tuple[str, str]],
mailu_email: str,
) -> tuple[int, str, list[str]]:
def _summarize_mail_accounts(self, accounts: list[tuple[str, str]], mailu_email: str) -> tuple[int, str, list[str]]:
mailu_accounts = self._mailu_accounts(accounts)
account_count = len(mailu_accounts)
primary_email = ""
@ -425,11 +298,7 @@ class NextcloudService:
primary_email = account_email
return account_count, primary_email, editor_mode_ids
def _mail_sync_context(
self,
user: dict[str, Any],
counters: MailSyncCounters,
) -> tuple[str, str, str, str, dict[str, Any]] | None:
def _mail_sync_context(self, user: dict[str, Any], counters: MailSyncCounters) -> tuple[str, str, str, str, dict[str, Any]] | None:
normalized = self._normalize_user(user)
if not normalized:
counters.skipped += 1
@ -448,14 +317,7 @@ class NextcloudService:
pass
return username, user_id, mailu_email, app_pw, full_user
def _sync_mail_accounts(
self,
username: str,
mailu_email: str,
app_pw: str,
accounts: list[tuple[str, str]],
counters: MailSyncCounters,
) -> bool:
def _sync_mail_accounts(self, username: str, mailu_email: str, app_pw: str, accounts: list[tuple[str, str]], counters: MailSyncCounters) -> bool:
mailu_accounts = self._mailu_accounts(accounts)
if mailu_accounts:
primary_id, _primary_email = self._select_primary_account(mailu_accounts, mailu_email)
@ -473,12 +335,7 @@ class NextcloudService:
counters.created += 1
return True
def _apply_mail_metadata(
self,
user_id: str,
mailu_email: str,
accounts: list[tuple[str, str]],
) -> None:
def _apply_mail_metadata(self, user_id: str, mailu_email: str, accounts: list[tuple[str, str]]) -> None:
account_count, primary_email, editor_mode_ids = self._summarize_mail_accounts(accounts, mailu_email)
self._set_editor_mode_richtext(editor_mode_ids)
if user_id:
@ -491,7 +348,7 @@ class NextcloudService:
username, user_id, mailu_email, app_pw, full_user = context
try:
display_name = self._display_name(full_user)
display_name = _display_name(full_user)
self._ensure_nextcloud_user(username, mailu_email, display_name)
except Exception as exc:
counters.record_failure(f"nextcloud user ensure failed: {exc}")
@ -558,13 +415,6 @@ class NextcloudService:
return {"status": counters.status(), "summary": summary_payload, "detail": summary.detail}
def _run_shell(self, script: str, check: bool = True) -> None:
    """Execute a shell script inside the Nextcloud pod with the configured exec timeout."""
    timeout = settings.nextcloud_exec_timeout_sec
    self._executor.exec(script, timeout_sec=timeout, check=check)
def _external_api(self, method: str, path: str, data: dict[str, Any] | None = None) -> dict[str, Any]:
if not settings.nextcloud_url:
raise RuntimeError("nextcloud url not configured")
@ -587,113 +437,7 @@ class NextcloudService:
return {}
def run_maintenance(self) -> dict[str, Any]:
if not settings.nextcloud_namespace:
raise RuntimeError("nextcloud maintenance not configured")
try:
self._run_shell(
"""
set -euo pipefail
if [ ! -d /var/www/html/lib ] && [ -d /usr/src/nextcloud/lib ]; then
if command -v rsync >/dev/null 2>&1; then
rsync -a --delete --exclude config --exclude data /usr/src/nextcloud/ /var/www/html/
else
cp -a /usr/src/nextcloud/. /var/www/html/
fi
fi
mkdir -p /var/www/html/data
chown 33:33 /var/www/html || true
chmod 775 /var/www/html || true
chown -R 33:33 /var/www/html/apps /var/www/html/custom_apps /var/www/html/data /var/www/html/config 2>/dev/null || true
""",
check=False,
)
self._occ(["config:app:set", "theming", "name", "--value", "Atlas Cloud"])
self._occ(["config:app:set", "theming", "slogan", "--value", "Unified access to Atlas services"])
theming_url = settings.nextcloud_url or "https://cloud.bstein.dev"
self._occ(["config:app:set", "theming", "url", "--value", theming_url])
self._occ(["config:app:set", "theming", "color", "--value", "#0f172a"])
self._occ(["config:app:set", "theming", "disable-user-theming", "--value", "yes"])
self._executor.exec(
["runuser", "-u", "www-data", "--", "php", "/var/www/html/occ", "app:install", "customcss"],
timeout_sec=settings.nextcloud_exec_timeout_sec,
check=False,
)
self._executor.exec(
["runuser", "-u", "www-data", "--", "php", "/var/www/html/occ", "app:enable", "customcss"],
timeout_sec=settings.nextcloud_exec_timeout_sec,
check=False,
)
mail_css = (
".mail-message-body, .mail-message-body pre, .mail-message-body code, .mail-message-body table {\n"
" font-family: \"Inter\", \"Source Sans 3\", \"Helvetica Neue\", Arial, sans-serif;\n"
" font-size: 14px;\n"
" line-height: 1.6;\n"
" color: var(--color-main-text);\n"
"}\n"
".mail-message-body pre {\n"
" background: rgba(15, 23, 42, 0.06);\n"
" padding: 12px;\n"
" border-radius: 8px;\n"
"}\n"
".mail-message-body blockquote {\n"
" border-left: 3px solid var(--color-border);\n"
" padding-left: 12px;\n"
" margin: 8px 0;\n"
" color: var(--color-text-lighter);\n"
"}\n"
".mail-message-body img {\n"
" max-width: 100%;\n"
" border-radius: 6px;\n"
"}\n"
)
self._occ(["config:app:set", "customcss", "css", "--value", mail_css])
self._occ(["config:app:set", "files", "default_quota", "--value", "250 GB"])
payload = self._external_api("GET", "?format=json")
links = payload.get("ocs", {}).get("data", []) if isinstance(payload, dict) else []
for link in links:
link_id = link.get("id") if isinstance(link, dict) else None
if link_id is not None:
self._external_api("DELETE", f"/sites/{link_id}?format=json")
sites = [
("Vaultwarden", "https://vault.bstein.dev"),
("Jellyfin", "https://stream.bstein.dev"),
("Gitea", "https://scm.bstein.dev"),
("Jenkins", "https://ci.bstein.dev"),
("Harbor", "https://registry.bstein.dev"),
("Vault", "https://secret.bstein.dev"),
("Jitsi", "https://meet.bstein.dev"),
("Grafana", "https://metrics.bstein.dev"),
("Chat LLM", "https://chat.ai.bstein.dev"),
("Vision", "https://draw.ai.bstein.dev"),
("STT/TTS", "https://talk.ai.bstein.dev"),
]
for name, url in sites:
self._external_api(
"POST",
"/sites?format=json",
data={
"name": name,
"url": url,
"lang": "",
"type": "link",
"device": "",
"icon": "",
"groups[]": "",
"redirect": "1",
},
)
except (ExecError, PodSelectionError, TimeoutError) as exc:
return {"status": "error", "detail": str(exc)}
except Exception as exc: # noqa: BLE001
return {"status": "error", "detail": str(exc)}
return {"status": "ok", "detail": "maintenance complete"}
return run_nextcloud_maintenance(self)
nextcloud = NextcloudService()

View File

@ -0,0 +1,106 @@
"""Mail synchronization helpers for Nextcloud account management."""
from __future__ import annotations
from dataclasses import dataclass
import re
from typing import Any
from ..settings import settings
def _extract_attr(attrs: Any, key: str) -> str:
if not isinstance(attrs, dict):
return ""
raw = attrs.get(key)
if isinstance(raw, list):
for item in raw:
if isinstance(item, str) and item.strip():
return item.strip()
return ""
if isinstance(raw, str) and raw.strip():
return raw.strip()
return ""
def _resolve_mailu_email(username: str, user: dict[str, Any]) -> str:
    """Resolve the Mailu mailbox address for a Keycloak user record."""
    # An explicit mailu_email attribute always wins.
    from_attr = _extract_attr(user.get("attributes"), "mailu_email")
    if from_attr:
        return from_attr
    # Otherwise accept the profile email only when it already lives on the
    # Mailu domain; anything else falls back to username@domain.
    profile_email = user.get("email")
    if isinstance(profile_email, str):
        profile_email = profile_email.strip()
        suffix = "@" + settings.mailu_domain.lower()
        if profile_email and profile_email.lower().endswith(suffix):
            return profile_email
    return f"{username}@{settings.mailu_domain}"
def _parse_mail_export(output: str) -> list[tuple[str, str]]:
accounts: list[tuple[str, str]] = []
account_id = ""
for line in output.splitlines():
line = line.strip()
if not line:
continue
match = re.match(r"^Account\s+(\d+):", line, flags=re.IGNORECASE)
if match:
account_id = match.group(1)
continue
match = re.match(r"^-\s*E-?mail:\s*(\S+)", line, flags=re.IGNORECASE)
if match and account_id:
accounts.append((account_id, match.group(1)))
return accounts
def display_name(user: dict[str, Any]) -> str:
    """Return a human display name from Keycloak first/last name fields."""
    raw_first = user.get("firstName")
    raw_last = user.get("lastName")
    # Non-string values count as missing.
    first = raw_first.strip() if isinstance(raw_first, str) else ""
    last = raw_last.strip() if isinstance(raw_last, str) else ""
    if first and last:
        return " ".join((first, last))
    return last or first
@dataclass(frozen=True)
class NextcloudMailSyncSummary:
    """Immutable result of one Nextcloud mail-sync run."""

    processed: int  # users examined during the run
    created: int  # mail accounts newly provisioned
    updated: int  # existing mail accounts refreshed
    deleted: int  # extra mail accounts removed
    skipped: int  # users skipped (e.g. could not be normalized)
    failures: int  # users whose sync raised an error
    detail: str = ""  # first error message captured, empty when clean
@dataclass
class MailSyncCounters:
    """Running tallies collected while syncing mail accounts."""

    processed: int = 0
    created: int = 0
    updated: int = 0
    deleted: int = 0
    skipped: int = 0
    failures: int = 0
    last_error: str = ""

    def summary(self) -> NextcloudMailSyncSummary:
        """Freeze the current tallies into a NextcloudMailSyncSummary."""
        return NextcloudMailSyncSummary(
            processed=self.processed,
            created=self.created,
            updated=self.updated,
            deleted=self.deleted,
            skipped=self.skipped,
            failures=self.failures,
            detail=self.last_error,
        )

    def status(self) -> str:
        """Return "ok" when no failures were recorded, otherwise "error"."""
        return "error" if self.failures else "ok"

    def record_failure(self, detail: str) -> None:
        """Bump the failure count; keep only the earliest non-empty detail."""
        self.failures += 1
        if detail and not self.last_error:
            self.last_error = detail

View File

@ -0,0 +1,130 @@
"""Nextcloud maintenance task implementation."""
from __future__ import annotations
from typing import Any
from ..k8s.exec import ExecError
from ..k8s.pods import PodSelectionError
from ..settings import settings
def _run_shell(service: Any, script: str, check: bool = True) -> None:
    """Run *script* inside the Nextcloud pod via the service's pod executor."""
    # Reaches into the private executor; this helper is deliberately coupled
    # to NextcloudService.
    service._executor.exec(script, timeout_sec=settings.nextcloud_exec_timeout_sec, check=check)
def run_maintenance(service: Any) -> dict[str, Any]:
    """Run theming, app-link, quota, and filesystem maintenance for Nextcloud.

    ``service`` is duck-typed as NextcloudService: its private ``_executor``,
    ``_occ``, and ``_external_api`` members are used directly. Returns a
    status dict ({"status", "detail"}) and never raises for known failures.
    """
    if not settings.nextcloud_namespace:
        raise RuntimeError("nextcloud maintenance not configured")
    try:
        # Best-effort (check=False) bootstrap: hydrate /var/www/html from the
        # image payload when the volume is empty, then repair ownership and
        # permissions for the www-data (uid/gid 33) user.
        _run_shell(
            service,
            """
set -euo pipefail
if [ ! -d /var/www/html/lib ] && [ -d /usr/src/nextcloud/lib ]; then
if command -v rsync >/dev/null 2>&1; then
rsync -a --delete --exclude config --exclude data /usr/src/nextcloud/ /var/www/html/
else
cp -a /usr/src/nextcloud/. /var/www/html/
fi
fi
mkdir -p /var/www/html/data
chown 33:33 /var/www/html || true
chmod 775 /var/www/html || true
chown -R 33:33 /var/www/html/apps /var/www/html/custom_apps /var/www/html/data /var/www/html/config 2>/dev/null || true
""",
            check=False,
        )
        # Apply Atlas branding via the theming app.
        service._occ(["config:app:set", "theming", "name", "--value", "Atlas Cloud"])
        service._occ(["config:app:set", "theming", "slogan", "--value", "Unified access to Atlas services"])
        theming_url = settings.nextcloud_url or "https://cloud.bstein.dev"
        service._occ(["config:app:set", "theming", "url", "--value", theming_url])
        service._occ(["config:app:set", "theming", "color", "--value", "#0f172a"])
        service._occ(["config:app:set", "theming", "disable-user-theming", "--value", "yes"])
        # Install/enable customcss best-effort (check=False: it may already be
        # installed, which would otherwise fail the occ call).
        service._executor.exec(
            ["runuser", "-u", "www-data", "--", "php", "/var/www/html/occ", "app:install", "customcss"],
            timeout_sec=settings.nextcloud_exec_timeout_sec,
            check=False,
        )
        service._executor.exec(
            ["runuser", "-u", "www-data", "--", "php", "/var/www/html/occ", "app:enable", "customcss"],
            timeout_sec=settings.nextcloud_exec_timeout_sec,
            check=False,
        )
        # CSS injected into the Mail app's message view for readable typography.
        mail_css = (
            ".mail-message-body, .mail-message-body pre, .mail-message-body code, .mail-message-body table {\n"
            " font-family: \"Inter\", \"Source Sans 3\", \"Helvetica Neue\", Arial, sans-serif;\n"
            " font-size: 14px;\n"
            " line-height: 1.6;\n"
            " color: var(--color-main-text);\n"
            "}\n"
            ".mail-message-body pre {\n"
            " background: rgba(15, 23, 42, 0.06);\n"
            " padding: 12px;\n"
            " border-radius: 8px;\n"
            "}\n"
            ".mail-message-body blockquote {\n"
            " border-left: 3px solid var(--color-border);\n"
            " padding-left: 12px;\n"
            " margin: 8px 0;\n"
            " color: var(--color-text-lighter);\n"
            "}\n"
            ".mail-message-body img {\n"
            " max-width: 100%;\n"
            " border-radius: 6px;\n"
            "}\n"
        )
        service._occ(["config:app:set", "customcss", "css", "--value", mail_css])
        service._occ(["config:app:set", "files", "default_quota", "--value", "250 GB"])
        # Rebuild the external-sites links: delete every existing link, then
        # recreate the canonical Atlas service list below.
        payload = service._external_api("GET", "?format=json")
        links = payload.get("ocs", {}).get("data", []) if isinstance(payload, dict) else []
        for link in links:
            link_id = link.get("id") if isinstance(link, dict) else None
            if link_id is not None:
                service._external_api("DELETE", f"/sites/{link_id}?format=json")
        sites = [
            ("Vaultwarden", "https://vault.bstein.dev"),
            ("Jellyfin", "https://stream.bstein.dev"),
            ("Gitea", "https://scm.bstein.dev"),
            ("Jenkins", "https://ci.bstein.dev"),
            ("Harbor", "https://registry.bstein.dev"),
            ("Vault", "https://secret.bstein.dev"),
            ("Jitsi", "https://meet.bstein.dev"),
            ("Grafana", "https://metrics.bstein.dev"),
            ("Chat LLM", "https://chat.ai.bstein.dev"),
            ("Vision", "https://draw.ai.bstein.dev"),
            ("STT/TTS", "https://talk.ai.bstein.dev"),
        ]
        for name, url in sites:
            service._external_api(
                "POST",
                "/sites?format=json",
                data={
                    "name": name,
                    "url": url,
                    "lang": "",
                    "type": "link",
                    "device": "",
                    "icon": "",
                    "groups[]": "",
                    "redirect": "1",
                },
            )
    except (ExecError, PodSelectionError, TimeoutError) as exc:
        # Known infrastructure failures are reported as data, not raised.
        return {"status": "error", "detail": str(exc)}
    except Exception as exc:  # noqa: BLE001
        # Catch-all keeps a scheduled maintenance run from crashing the caller.
        return {"status": "error", "detail": str(exc)}
    return {"status": "ok", "detail": "maintenance complete"}

View File

@ -24,6 +24,8 @@ HTTP_NOT_FOUND = 404
def parse_size(value: str) -> int:
"""Convert OpenSearch CAT index size text into bytes."""
if not value:
return 0
text = value.strip().lower()
@ -65,6 +67,8 @@ def _delete_index(client: httpx.Client, index: str) -> None:
def prune_indices() -> OpensearchPruneSummary:
"""Delete old OpenSearch indices until usage is under the configured limit."""
patterns = [p.strip() for p in settings.opensearch_index_patterns.split(",") if p.strip()]
if not patterns:
return OpensearchPruneSummary(0, 0, 0, detail="no patterns configured")

View File

@ -28,6 +28,8 @@ def _delete_pod(namespace: str, name: str) -> None:
def clean_finished_pods() -> PodCleanerSummary:
"""Delete succeeded and failed pods across namespaces."""
deleted = 0
skipped = 0
failures = 0

View File

@ -8,6 +8,9 @@ import httpx
from ..settings import settings
from ..utils.logging import get_logger
from .vault_policies import DEV_KV_POLICY as _DEV_KV_POLICY
from .vault_policies import K8S_ROLES as _K8S_ROLES
from .vault_policies import VAULT_ADMIN_POLICY as _VAULT_ADMIN_POLICY
logger = get_logger(__name__)
@ -45,264 +48,9 @@ def _build_policy(read_paths: str, write_paths: str) -> str:
)
return "\n".join(policy_parts).strip() + "\n"
_K8S_ROLES: list[dict[str, str]] = [
{
"role": "outline",
"namespace": "outline",
"service_accounts": "outline-vault",
"read_paths": "outline/* shared/postmark-relay",
"write_paths": "",
},
{
"role": "planka",
"namespace": "planka",
"service_accounts": "planka-vault",
"read_paths": "planka/* shared/postmark-relay",
"write_paths": "",
},
{
"role": "bstein-dev-home",
"namespace": "bstein-dev-home",
"service_accounts": "bstein-dev-home,bstein-dev-home-vault-sync",
"read_paths": "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay "
"mailu/mailu-initial-account-secret shared/harbor-pull",
"write_paths": "",
},
{
"role": "gitea",
"namespace": "gitea",
"service_accounts": "gitea-vault",
"read_paths": "gitea/*",
"write_paths": "",
},
{
"role": "vaultwarden",
"namespace": "vaultwarden",
"service_accounts": "vaultwarden-vault",
"read_paths": "vaultwarden/* mailu/mailu-initial-account-secret",
"write_paths": "",
},
{
"role": "sso",
"namespace": "sso",
"service_accounts": "sso-vault,sso-vault-sync,mas-secrets-ensure",
"read_paths": "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin "
"shared/portal-e2e-client shared/postmark-relay shared/harbor-pull",
"write_paths": "",
},
{
"role": "mailu-mailserver",
"namespace": "mailu-mailserver",
"service_accounts": "mailu-vault-sync",
"read_paths": "mailu/* shared/postmark-relay shared/harbor-pull",
"write_paths": "",
},
{
"role": "harbor",
"namespace": "harbor",
"service_accounts": "harbor-vault-sync",
"read_paths": "harbor/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "nextcloud",
"namespace": "nextcloud",
"service_accounts": "nextcloud-vault",
"read_paths": "nextcloud/* shared/keycloak-admin shared/postmark-relay",
"write_paths": "",
},
{
"role": "comms",
"namespace": "comms",
"service_accounts": "comms-vault,atlasbot",
"read_paths": "comms/* shared/chat-ai-keys-runtime shared/harbor-pull",
"write_paths": "",
},
{
"role": "jenkins",
"namespace": "jenkins",
"service_accounts": "jenkins",
"read_paths": "jenkins/*",
"write_paths": "",
},
{
"role": "monitoring",
"namespace": "monitoring",
"service_accounts": "monitoring-vault-sync",
"read_paths": "monitoring/* shared/postmark-relay shared/harbor-pull",
"write_paths": "",
},
{
"role": "logging",
"namespace": "logging",
"service_accounts": "logging-vault-sync",
"read_paths": "logging/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "pegasus",
"namespace": "jellyfin",
"service_accounts": "pegasus-vault-sync",
"read_paths": "pegasus/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "crypto",
"namespace": "crypto",
"service_accounts": "crypto-vault-sync",
"read_paths": "crypto/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "health",
"namespace": "health",
"service_accounts": "health-vault-sync",
"read_paths": "health/*",
"write_paths": "",
},
{
"role": "maintenance",
"namespace": "maintenance",
"service_accounts": "ariadne,maintenance-vault-sync",
"read_paths": "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret "
"mailu/mailu-initial-account-secret comms/synapse-admin shared/harbor-pull",
"write_paths": "",
},
{
"role": "finance",
"namespace": "finance",
"service_accounts": "finance-vault",
"read_paths": "finance/* shared/postmark-relay",
"write_paths": "",
},
{
"role": "finance-secrets",
"namespace": "finance",
"service_accounts": "finance-secrets-ensure",
"read_paths": "",
"write_paths": "finance/*",
},
{
"role": "longhorn",
"namespace": "longhorn-system",
"service_accounts": "longhorn-vault,longhorn-vault-sync",
"read_paths": "longhorn/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "postgres",
"namespace": "postgres",
"service_accounts": "postgres-vault",
"read_paths": "postgres/postgres-db",
"write_paths": "",
},
{
"role": "vault",
"namespace": "vault",
"service_accounts": "vault",
"read_paths": "vault/*",
"write_paths": "",
},
{
"role": "sso-secrets",
"namespace": "sso",
"service_accounts": "mas-secrets-ensure",
"read_paths": "shared/keycloak-admin",
"write_paths": "harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc "
"logging/oauth2-proxy-logs-oidc finance/actual-oidc",
},
{
"role": "crypto-secrets",
"namespace": "crypto",
"service_accounts": "crypto-secrets-ensure",
"read_paths": "",
"write_paths": "crypto/wallet-monero-temp-rpc-auth",
},
{
"role": "comms-secrets",
"namespace": "comms",
"service_accounts": "comms-secrets-ensure,mas-db-ensure,mas-admin-client-secret-writer,othrys-synapse-signingkey-job",
"read_paths": "",
"write_paths": "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon "
"comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin comms/synapse-registration "
"comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey",
},
]
_VAULT_ADMIN_POLICY = """
path "sys/auth" {
capabilities = ["read"]
}
path "sys/auth/*" {
capabilities = ["create", "update", "delete", "sudo", "read"]
}
path "auth/kubernetes/*" {
capabilities = ["create", "update", "read"]
}
path "auth/oidc/*" {
capabilities = ["create", "update", "read"]
}
path "sys/policies/acl" {
capabilities = ["list"]
}
path "sys/policies/acl/*" {
capabilities = ["create", "update", "read"]
}
path "sys/internal/ui/mounts" {
capabilities = ["read"]
}
path "sys/mounts" {
capabilities = ["read"]
}
path "sys/mounts/auth/*" {
capabilities = ["read", "update", "sudo"]
}
path "kv/data/atlas/vault/*" {
capabilities = ["read"]
}
path "kv/metadata/atlas/vault/*" {
capabilities = ["list"]
}
path "kv/data/*" {
capabilities = ["create", "update", "read", "delete", "patch"]
}
path "kv/metadata" {
capabilities = ["list"]
}
path "kv/metadata/*" {
capabilities = ["read", "list", "delete"]
}
path "kv/data/atlas/shared/*" {
capabilities = ["create", "update", "read", "patch"]
}
path "kv/metadata/atlas/shared/*" {
capabilities = ["list"]
}
""".strip()
_DEV_KV_POLICY = """
path "kv/metadata" {
capabilities = ["list"]
}
path "kv/metadata/atlas" {
capabilities = ["list"]
}
path "kv/metadata/atlas/shared" {
capabilities = ["list"]
}
path "kv/metadata/atlas/shared/*" {
capabilities = ["list"]
}
path "kv/data/atlas/shared/*" {
capabilities = ["read"]
}
""".strip()
class VaultClient:
"""Minimal HTTP client for Vault API requests."""
def __init__(self, base_url: str, token: str | None = None) -> None:
self._base_url = base_url.rstrip("/")
self._token = token
@ -321,6 +69,8 @@ class VaultClient:
class VaultService:
"""Ensure Vault is initialized, unsealed, and configured for Atlas access."""
def __init__(self) -> None:
self._token: str | None = None

View File

@ -0,0 +1,258 @@
"""Vault role and policy definitions used by Ariadne Vault reconciliation."""
from __future__ import annotations
K8S_ROLES: list[dict[str, str]] = [
{
"role": "outline",
"namespace": "outline",
"service_accounts": "outline-vault",
"read_paths": "outline/* shared/postmark-relay",
"write_paths": "",
},
{
"role": "planka",
"namespace": "planka",
"service_accounts": "planka-vault",
"read_paths": "planka/* shared/postmark-relay",
"write_paths": "",
},
{
"role": "bstein-dev-home",
"namespace": "bstein-dev-home",
"service_accounts": "bstein-dev-home,bstein-dev-home-vault-sync",
"read_paths": "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay "
"mailu/mailu-initial-account-secret shared/harbor-pull",
"write_paths": "",
},
{
"role": "gitea",
"namespace": "gitea",
"service_accounts": "gitea-vault",
"read_paths": "gitea/*",
"write_paths": "",
},
{
"role": "vaultwarden",
"namespace": "vaultwarden",
"service_accounts": "vaultwarden-vault",
"read_paths": "vaultwarden/* mailu/mailu-initial-account-secret",
"write_paths": "",
},
{
"role": "sso",
"namespace": "sso",
"service_accounts": "sso-vault,sso-vault-sync,mas-secrets-ensure",
"read_paths": "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin "
"shared/portal-e2e-client shared/postmark-relay shared/harbor-pull",
"write_paths": "",
},
{
"role": "mailu-mailserver",
"namespace": "mailu-mailserver",
"service_accounts": "mailu-vault-sync",
"read_paths": "mailu/* shared/postmark-relay shared/harbor-pull",
"write_paths": "",
},
{
"role": "harbor",
"namespace": "harbor",
"service_accounts": "harbor-vault-sync",
"read_paths": "harbor/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "nextcloud",
"namespace": "nextcloud",
"service_accounts": "nextcloud-vault",
"read_paths": "nextcloud/* shared/keycloak-admin shared/postmark-relay",
"write_paths": "",
},
{
"role": "comms",
"namespace": "comms",
"service_accounts": "comms-vault,atlasbot",
"read_paths": "comms/* shared/chat-ai-keys-runtime shared/harbor-pull",
"write_paths": "",
},
{
"role": "jenkins",
"namespace": "jenkins",
"service_accounts": "jenkins",
"read_paths": "jenkins/*",
"write_paths": "",
},
{
"role": "monitoring",
"namespace": "monitoring",
"service_accounts": "monitoring-vault-sync",
"read_paths": "monitoring/* shared/postmark-relay shared/harbor-pull",
"write_paths": "",
},
{
"role": "logging",
"namespace": "logging",
"service_accounts": "logging-vault-sync",
"read_paths": "logging/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "pegasus",
"namespace": "jellyfin",
"service_accounts": "pegasus-vault-sync",
"read_paths": "pegasus/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "crypto",
"namespace": "crypto",
"service_accounts": "crypto-vault-sync",
"read_paths": "crypto/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "health",
"namespace": "health",
"service_accounts": "health-vault-sync",
"read_paths": "health/*",
"write_paths": "",
},
{
"role": "maintenance",
"namespace": "maintenance",
"service_accounts": "ariadne,maintenance-vault-sync",
"read_paths": "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret "
"mailu/mailu-initial-account-secret comms/synapse-admin shared/harbor-pull",
"write_paths": "",
},
{
"role": "finance",
"namespace": "finance",
"service_accounts": "finance-vault",
"read_paths": "finance/* shared/postmark-relay",
"write_paths": "",
},
{
"role": "finance-secrets",
"namespace": "finance",
"service_accounts": "finance-secrets-ensure",
"read_paths": "",
"write_paths": "finance/*",
},
{
"role": "longhorn",
"namespace": "longhorn-system",
"service_accounts": "longhorn-vault,longhorn-vault-sync",
"read_paths": "longhorn/* shared/harbor-pull",
"write_paths": "",
},
{
"role": "postgres",
"namespace": "postgres",
"service_accounts": "postgres-vault",
"read_paths": "postgres/postgres-db",
"write_paths": "",
},
{
"role": "vault",
"namespace": "vault",
"service_accounts": "vault",
"read_paths": "vault/*",
"write_paths": "",
},
{
"role": "sso-secrets",
"namespace": "sso",
"service_accounts": "mas-secrets-ensure",
"read_paths": "shared/keycloak-admin",
"write_paths": "harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc "
"logging/oauth2-proxy-logs-oidc finance/actual-oidc",
},
{
"role": "crypto-secrets",
"namespace": "crypto",
"service_accounts": "crypto-secrets-ensure",
"read_paths": "",
"write_paths": "crypto/wallet-monero-temp-rpc-auth",
},
{
"role": "comms-secrets",
"namespace": "comms",
"service_accounts": "comms-secrets-ensure,mas-db-ensure,mas-admin-client-secret-writer,othrys-synapse-signingkey-job",
"read_paths": "",
"write_paths": "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon "
"comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin comms/synapse-registration "
"comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey",
},
]
# Vault ACL (HCL) granted to platform administrators: manage auth mounts and
# ACL policies, full write over the kv data tree, but read-only access to the
# atlas/vault/* secrets themselves.
VAULT_ADMIN_POLICY = """
path "sys/auth" {
capabilities = ["read"]
}
path "sys/auth/*" {
capabilities = ["create", "update", "delete", "sudo", "read"]
}
path "auth/kubernetes/*" {
capabilities = ["create", "update", "read"]
}
path "auth/oidc/*" {
capabilities = ["create", "update", "read"]
}
path "sys/policies/acl" {
capabilities = ["list"]
}
path "sys/policies/acl/*" {
capabilities = ["create", "update", "read"]
}
path "sys/internal/ui/mounts" {
capabilities = ["read"]
}
path "sys/mounts" {
capabilities = ["read"]
}
path "sys/mounts/auth/*" {
capabilities = ["read", "update", "sudo"]
}
path "kv/data/atlas/vault/*" {
capabilities = ["read"]
}
path "kv/metadata/atlas/vault/*" {
capabilities = ["list"]
}
path "kv/data/*" {
capabilities = ["create", "update", "read", "delete", "patch"]
}
path "kv/metadata" {
capabilities = ["list"]
}
path "kv/metadata/*" {
capabilities = ["read", "list", "delete"]
}
path "kv/data/atlas/shared/*" {
capabilities = ["create", "update", "read", "patch"]
}
path "kv/metadata/atlas/shared/*" {
capabilities = ["list"]
}
""".strip()
# Read-only developer Vault ACL (HCL): list kv metadata down the atlas/shared
# tree and read only the shared secrets.
DEV_KV_POLICY = """
path "kv/metadata" {
capabilities = ["list"]
}
path "kv/metadata/atlas" {
capabilities = ["list"]
}
path "kv/metadata/atlas/shared" {
capabilities = ["list"]
}
path "kv/metadata/atlas/shared/*" {
capabilities = ["list"]
}
path "kv/data/atlas/shared/*" {
capabilities = ["read"]
}
""".strip()

View File

@ -33,6 +33,8 @@ class VaultwardenLookup:
class VaultwardenService:
"""Invite eligible users to Vaultwarden through the admin interface."""
def __init__(self) -> None:
self._admin_lock = threading.Lock()
self._admin_client: httpx.Client | None = None

View File

@ -242,17 +242,12 @@ def _handle_existing_invite(state: VaultwardenInviteState) -> bool:
state.counters.skipped += 1
return True
if not _should_refresh_invite(state.synced_ts):
if not state.synced_at:
_set_sync_status(state.username, state.status)
state.counters.skipped += 1
return True
return False
def _sync_user(
user: dict[str, Any],
counters: VaultwardenSyncCounters,
) -> tuple[str | None, bool]:
def _sync_user(user: dict[str, Any], counters: VaultwardenSyncCounters) -> tuple[str | None, bool]:
status: str | None = None
ok = False
normalized = _normalize_user(user)
@ -297,6 +292,8 @@ def _sync_user(
def run_vaultwarden_sync() -> VaultwardenSyncSummary:
"""Process pending Vaultwarden invite failures until the queue is healthy."""
consecutive_failures = 0
counters = VaultwardenSyncCounters()

View File

@ -3,7 +3,6 @@ from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any
import textwrap
from ..k8s.exec import ExecError, PodExecutor
from ..k8s.pods import PodSelectionError
@ -12,6 +11,8 @@ from ..utils.logging import get_logger
from ..utils.passwords import random_password
from .keycloak_admin import keycloak_admin
from .mailu import mailu
from .wger_scripts import WGER_PASSWORD_CHECK_SCRIPT as _WGER_PASSWORD_CHECK_SCRIPT
from .wger_scripts import WGER_SYNC_SCRIPT as _WGER_SYNC_SCRIPT
EXIT_PASSWORD_MATCH = 0
@ -23,179 +24,6 @@ WGER_PASSWORD_ROTATED_ATTR = "wger_password_rotated_at"
logger = get_logger(__name__)
_WGER_SYNC_SCRIPT = textwrap.dedent(
"""
from __future__ import annotations
import os
import sys
import django
def _env(name: str, default: str = "") -> str:
value = os.getenv(name, default)
return value.strip() if isinstance(value, str) else ""
def _setup_django() -> None:
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.main")
django.setup()
def _set_default_gym(user) -> None:
try:
from wger.gym.models import GymConfig
except Exception:
return
try:
config = GymConfig.objects.first()
except Exception:
return
if not config or not getattr(config, "default_gym", None):
return
profile = getattr(user, "userprofile", None)
if not profile or getattr(profile, "gym", None):
return
profile.gym = config.default_gym
profile.save()
def _ensure_profile(user) -> None:
profile = getattr(user, "userprofile", None)
if not profile:
return
if hasattr(profile, "email_verified") and not profile.email_verified:
profile.email_verified = True
if hasattr(profile, "is_temporary") and profile.is_temporary:
profile.is_temporary = False
profile.save()
def _ensure_admin(username: str, password: str, email: str) -> None:
from django.contrib.auth.models import User
if not username or not password:
raise RuntimeError("admin username/password missing")
user, created = User.objects.get_or_create(username=username)
if created:
user.is_active = True
if not user.is_staff:
user.is_staff = True
if email:
user.email = email
user.set_password(password)
user.save()
_ensure_profile(user)
_set_default_gym(user)
print(f"ensured admin user {username}")
def _ensure_user(username: str, password: str, email: str) -> None:
from django.contrib.auth.models import User
if not username or not password:
raise RuntimeError("username/password missing")
user, created = User.objects.get_or_create(username=username)
if created:
user.is_active = True
if email and user.email != email:
user.email = email
user.set_password(password)
user.save()
_ensure_profile(user)
_set_default_gym(user)
action = "created" if created else "updated"
print(f"{action} user {username}")
def main() -> int:
admin_user = _env("WGER_ADMIN_USERNAME")
admin_password = _env("WGER_ADMIN_PASSWORD")
admin_email = _env("WGER_ADMIN_EMAIL")
username = _env("WGER_USERNAME") or _env("ONLY_USERNAME")
password = _env("WGER_PASSWORD")
email = _env("WGER_EMAIL")
if not any([admin_user and admin_password, username and password]):
print("no admin or user payload provided; exiting")
return 0
_setup_django()
if admin_user and admin_password:
_ensure_admin(admin_user, admin_password, admin_email)
if username and password:
_ensure_user(username, password, email)
return 0
if __name__ == "__main__":
sys.exit(main())
"""
).strip()
_WGER_PASSWORD_CHECK_SCRIPT = textwrap.dedent(
"""
from __future__ import annotations
import os
import sys
import django
def _env(name: str, default: str = "") -> str:
value = os.getenv(name, default)
return value.strip() if isinstance(value, str) else ""
def _setup_django() -> None:
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.main")
django.setup()
def main() -> int:
username = _env("WGER_USERNAME")
password = _env("WGER_PASSWORD")
if not username or not password:
print("missing username or password")
return 2
_setup_django()
from django.contrib.auth.models import User
user = User.objects.filter(username=username).first()
if not user:
print(f"user {username} missing")
return 3
if user.check_password(password):
print("password match")
return 0
print("password mismatch")
return 1
if __name__ == "__main__":
sys.exit(main())
"""
).strip()
def _wger_exec_command() -> str:
bootstrap = ". /vault/secrets/wger-env >/dev/null 2>&1 || true"
@ -446,6 +274,8 @@ def _rotation_check_input(username: str) -> tuple[WgerSyncInput | UserSyncOutcom
class WgerService:
"""Synchronize Keycloak users and password rotations into Wger."""
def __init__(self) -> None:
self._executor = PodExecutor(
settings.wger_namespace,

View File

@ -0,0 +1,180 @@
"""Embedded scripts executed inside the wger application pod."""
from __future__ import annotations
import textwrap
# Source of a standalone Python script run inside the wger application pod
# (see module docstring). It reads WGER_* environment variables, boots Django,
# then idempotently ensures the admin account and/or one user account exist:
# passwords are (re)set, profiles are normalised (_ensure_profile) and the
# default gym is attached when configured (_set_default_gym).
# NOTE(review): the literal below is runtime data — the embedded script's
# own indentation was lost in this rendering; restore it from VCS, do not
# reformat by hand.
WGER_SYNC_SCRIPT = textwrap.dedent(
"""
from __future__ import annotations
import os
import sys
import django
def _env(name: str, default: str = "") -> str:
value = os.getenv(name, default)
return value.strip() if isinstance(value, str) else ""
def _setup_django() -> None:
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.main")
django.setup()
def _set_default_gym(user) -> None:
try:
from wger.gym.models import GymConfig
except Exception:
return
try:
config = GymConfig.objects.first()
except Exception:
return
if not config or not getattr(config, "default_gym", None):
return
profile = getattr(user, "userprofile", None)
if not profile or getattr(profile, "gym", None):
return
profile.gym = config.default_gym
profile.save()
def _ensure_profile(user) -> None:
profile = getattr(user, "userprofile", None)
if not profile:
return
if hasattr(profile, "email_verified") and not profile.email_verified:
profile.email_verified = True
if hasattr(profile, "is_temporary") and profile.is_temporary:
profile.is_temporary = False
profile.save()
def _ensure_admin(username: str, password: str, email: str) -> None:
from django.contrib.auth.models import User
if not username or not password:
raise RuntimeError("admin username/password missing")
user, created = User.objects.get_or_create(username=username)
if created:
user.is_active = True
if not user.is_staff:
user.is_staff = True
if email:
user.email = email
user.set_password(password)
user.save()
_ensure_profile(user)
_set_default_gym(user)
print(f"ensured admin user {username}")
def _ensure_user(username: str, password: str, email: str) -> None:
from django.contrib.auth.models import User
if not username or not password:
raise RuntimeError("username/password missing")
user, created = User.objects.get_or_create(username=username)
if created:
user.is_active = True
if email and user.email != email:
user.email = email
user.set_password(password)
user.save()
_ensure_profile(user)
_set_default_gym(user)
action = "created" if created else "updated"
print(f"{action} user {username}")
def main() -> int:
admin_user = _env("WGER_ADMIN_USERNAME")
admin_password = _env("WGER_ADMIN_PASSWORD")
admin_email = _env("WGER_ADMIN_EMAIL")
username = _env("WGER_USERNAME") or _env("ONLY_USERNAME")
password = _env("WGER_PASSWORD")
email = _env("WGER_EMAIL")
if not any([admin_user and admin_password, username and password]):
print("no admin or user payload provided; exiting")
return 0
_setup_django()
if admin_user and admin_password:
_ensure_admin(admin_user, admin_password, admin_email)
if username and password:
_ensure_user(username, password, email)
return 0
if __name__ == "__main__":
sys.exit(main())
"""
).strip()
# Source of a standalone script that checks whether WGER_PASSWORD matches the
# stored password of WGER_USERNAME inside the wger pod. Exit codes (from the
# script body): 0 = password match, 1 = mismatch, 2 = missing username or
# password, 3 = user does not exist.
# NOTE(review): the literal below is runtime data — its internal indentation
# was lost in this rendering; restore it from VCS, do not reformat by hand.
WGER_PASSWORD_CHECK_SCRIPT = textwrap.dedent(
"""
from __future__ import annotations
import os
import sys
import django
def _env(name: str, default: str = "") -> str:
value = os.getenv(name, default)
return value.strip() if isinstance(value, str) else ""
def _setup_django() -> None:
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings.main")
django.setup()
def main() -> int:
username = _env("WGER_USERNAME")
password = _env("WGER_PASSWORD")
if not username or not password:
print("missing username or password")
return 2
_setup_django()
from django.contrib.auth.models import User
user = User.objects.filter(username=username).first()
if not user:
print(f"user {username} missing")
return 3
if user.check_password(password):
print("password match")
return 0
print("password mismatch")
return 1
if __name__ == "__main__":
sys.exit(main())
"""
).strip()

View File

@ -1,33 +1,28 @@
from __future__ import annotations
from dataclasses import dataclass
import os
from typing import Any
def _env(name: str, default: str = "") -> str:
value = os.getenv(name, default)
return value.strip() if isinstance(value, str) else default
def _env_bool(name: str, default: str = "false") -> bool:
return _env(name, default).lower() in {"1", "true", "yes", "y", "on"}
def _env_int(name: str, default: int) -> int:
raw = _env(name, str(default))
try:
return int(raw)
except ValueError:
return default
def _env_float(name: str, default: float) -> float:
raw = _env(name, str(default))
try:
return float(raw)
except ValueError:
return default
from .settings_env import _env, _env_bool, _env_float, _env_int
from .settings_sections import (
_cluster_state_config,
_comms_config,
_firefly_config,
_image_sweeper_config,
_jenkins_build_weather_config,
_jenkins_workspace_cleanup_config,
_keycloak_config,
_mailu_config,
_metis_config,
_nextcloud_config,
_opensearch_config,
_platform_quality_probe_config,
_portal_group_config,
_schedule_config,
_smtp_config,
_vault_config,
_vaultwarden_config,
_wger_config,
)
@dataclass(frozen=True)
@ -168,6 +163,15 @@ class Settings:
platform_quality_probe_wait_timeout_sec: float
platform_quality_probe_pushgateway_url: str
platform_quality_probe_http_timeout_sec: int
jenkins_base_url: str
jenkins_api_user: str
jenkins_api_token: str
jenkins_api_timeout_sec: float
jenkins_workspace_namespace: str
jenkins_workspace_pvc_prefix: str
jenkins_workspace_cleanup_min_age_hours: float
jenkins_workspace_cleanup_dry_run: bool
jenkins_workspace_cleanup_max_deletions_per_run: int
vaultwarden_namespace: str
vaultwarden_pod_label: str
@ -234,6 +238,8 @@ class Settings:
metis_token_sync_vault_k8s_role: str
metis_k3s_token_sync_cron: str
platform_quality_suite_probe_cron: str
jenkins_build_weather_cron: str
jenkins_workspace_cleanup_cron: str
opensearch_url: str
opensearch_limit_bytes: int
@ -242,334 +248,26 @@ class Settings:
metrics_path: str
@classmethod
def _keycloak_config(cls) -> dict[str, Any]:
keycloak_url = _env("KEYCLOAK_URL", "https://sso.bstein.dev").rstrip("/")
keycloak_realm = _env("KEYCLOAK_REALM", "atlas")
keycloak_client_id = _env("KEYCLOAK_CLIENT_ID", "bstein-dev-home")
keycloak_issuer = _env("KEYCLOAK_ISSUER", f"{keycloak_url}/realms/{keycloak_realm}").rstrip("/")
keycloak_jwks_url = _env("KEYCLOAK_JWKS_URL", f"{keycloak_issuer}/protocol/openid-connect/certs").rstrip("/")
return {
"keycloak_url": keycloak_url,
"keycloak_realm": keycloak_realm,
"keycloak_client_id": keycloak_client_id,
"keycloak_issuer": keycloak_issuer,
"keycloak_jwks_url": keycloak_jwks_url,
"keycloak_admin_url": _env("KEYCLOAK_ADMIN_URL", keycloak_url).rstrip("/"),
"keycloak_admin_realm": _env("KEYCLOAK_ADMIN_REALM", keycloak_realm),
"keycloak_admin_client_id": _env("KEYCLOAK_ADMIN_CLIENT_ID", ""),
"keycloak_admin_client_secret": _env("KEYCLOAK_ADMIN_CLIENT_SECRET", ""),
}
@classmethod
def _portal_group_config(cls) -> dict[str, Any]:
return {
"portal_admin_users": [u for u in (_env("PORTAL_ADMIN_USERS", "bstein")).split(",") if u.strip()],
"portal_admin_groups": [g for g in (_env("PORTAL_ADMIN_GROUPS", "admin")).split(",") if g.strip()],
"account_allowed_groups": [
g for g in (_env("ACCOUNT_ALLOWED_GROUPS", "dev,admin")).split(",") if g.strip()
],
"allowed_flag_groups": [g for g in (_env("ALLOWED_FLAG_GROUPS", "demo,test")).split(",") if g.strip()],
"default_user_groups": [g for g in (_env("DEFAULT_USER_GROUPS", "dev")).split(",") if g.strip()],
}
@classmethod
def _mailu_config(cls) -> dict[str, Any]:
mailu_domain = _env("MAILU_DOMAIN", "bstein.dev")
return {
"mailu_domain": mailu_domain,
"mailu_sync_url": _env(
"MAILU_SYNC_URL",
"http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events",
).rstrip("/"),
"mailu_event_min_interval_sec": _env_float("MAILU_EVENT_MIN_INTERVAL_SEC", 10.0),
"mailu_sync_wait_timeout_sec": _env_float("MAILU_SYNC_WAIT_TIMEOUT_SEC", 60.0),
"mailu_mailbox_wait_timeout_sec": _env_float("MAILU_MAILBOX_WAIT_TIMEOUT_SEC", 60.0),
"mailu_db_host": _env("MAILU_DB_HOST", "postgres-service.postgres.svc.cluster.local"),
"mailu_db_port": _env_int("MAILU_DB_PORT", 5432),
"mailu_db_name": _env("MAILU_DB_NAME", "mailu"),
"mailu_db_user": _env("MAILU_DB_USER", "mailu"),
"mailu_db_password": _env("MAILU_DB_PASSWORD", ""),
"mailu_host": _env("MAILU_HOST", f"mail.{mailu_domain}"),
"mailu_default_quota": _env_int("MAILU_DEFAULT_QUOTA", 20000000000),
"mailu_system_users": [u for u in _env("MAILU_SYSTEM_USERS", "").split(",") if u.strip()],
"mailu_system_password": _env("MAILU_SYSTEM_PASSWORD", ""),
}
@classmethod
def _smtp_config(cls, mailu_domain: str) -> dict[str, Any]:
return {
"smtp_host": _env("SMTP_HOST", ""),
"smtp_port": _env_int("SMTP_PORT", 25),
"smtp_username": _env("SMTP_USERNAME", ""),
"smtp_password": _env("SMTP_PASSWORD", ""),
"smtp_starttls": _env_bool("SMTP_STARTTLS", "false"),
"smtp_use_tls": _env_bool("SMTP_USE_TLS", "false"),
"smtp_from": _env("SMTP_FROM", f"postmaster@{mailu_domain}"),
"smtp_timeout_sec": _env_float("SMTP_TIMEOUT_SEC", 10.0),
"welcome_email_enabled": _env_bool("WELCOME_EMAIL_ENABLED", "true"),
}
@classmethod
def _nextcloud_config(cls) -> dict[str, Any]:
return {
"nextcloud_namespace": _env("NEXTCLOUD_NAMESPACE", "nextcloud"),
"nextcloud_pod_label": _env("NEXTCLOUD_POD_LABEL", "app=nextcloud"),
"nextcloud_container": _env("NEXTCLOUD_CONTAINER", "nextcloud"),
"nextcloud_exec_timeout_sec": _env_float("NEXTCLOUD_EXEC_TIMEOUT_SEC", 120.0),
"nextcloud_db_host": _env("NEXTCLOUD_DB_HOST", "postgres-service.postgres.svc.cluster.local"),
"nextcloud_db_port": _env_int("NEXTCLOUD_DB_PORT", 5432),
"nextcloud_db_name": _env("NEXTCLOUD_DB_NAME", "nextcloud"),
"nextcloud_db_user": _env("NEXTCLOUD_DB_USER", "nextcloud"),
"nextcloud_db_password": _env("NEXTCLOUD_DB_PASSWORD", ""),
"nextcloud_url": _env("NEXTCLOUD_URL", "https://cloud.bstein.dev").rstrip("/"),
"nextcloud_admin_user": _env("NEXTCLOUD_ADMIN_USER", ""),
"nextcloud_admin_password": _env("NEXTCLOUD_ADMIN_PASSWORD", ""),
}
@classmethod
def _wger_config(cls) -> dict[str, Any]:
return {
"wger_namespace": _env("WGER_NAMESPACE", "health"),
"wger_user_sync_wait_timeout_sec": _env_float("WGER_USER_SYNC_WAIT_TIMEOUT_SEC", 60.0),
"wger_pod_label": _env("WGER_POD_LABEL", "app=wger"),
"wger_container": _env("WGER_CONTAINER", "wger"),
"wger_admin_username": _env("WGER_ADMIN_USERNAME", ""),
"wger_admin_password": _env("WGER_ADMIN_PASSWORD", ""),
"wger_admin_email": _env("WGER_ADMIN_EMAIL", ""),
}
@classmethod
def _firefly_config(cls) -> dict[str, Any]:
return {
"firefly_namespace": _env("FIREFLY_NAMESPACE", "finance"),
"firefly_user_sync_wait_timeout_sec": _env_float("FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC", 90.0),
"firefly_pod_label": _env("FIREFLY_POD_LABEL", "app=firefly"),
"firefly_container": _env("FIREFLY_CONTAINER", "firefly"),
"firefly_cron_base_url": _env(
"FIREFLY_CRON_BASE_URL",
"http://firefly.finance.svc.cluster.local/api/v1/cron",
),
"firefly_cron_token": _env("FIREFLY_CRON_TOKEN", ""),
"firefly_cron_timeout_sec": _env_float("FIREFLY_CRON_TIMEOUT_SEC", 30.0),
}
@classmethod
def _vault_config(cls) -> dict[str, Any]:
return {
"vault_namespace": _env("VAULT_NAMESPACE", "vault"),
"vault_addr": _env("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/"),
"vault_token": _env("VAULT_TOKEN", ""),
"vault_k8s_role": _env("VAULT_K8S_ROLE", "vault"),
"vault_k8s_role_ttl": _env("VAULT_K8S_ROLE_TTL", "1h"),
"vault_k8s_token_reviewer_jwt": _env("VAULT_K8S_TOKEN_REVIEWER_JWT", ""),
"vault_k8s_token_reviewer_jwt_file": _env("VAULT_K8S_TOKEN_REVIEWER_JWT_FILE", ""),
"vault_oidc_discovery_url": _env("VAULT_OIDC_DISCOVERY_URL", ""),
"vault_oidc_client_id": _env("VAULT_OIDC_CLIENT_ID", ""),
"vault_oidc_client_secret": _env("VAULT_OIDC_CLIENT_SECRET", ""),
"vault_oidc_default_role": _env("VAULT_OIDC_DEFAULT_ROLE", "admin"),
"vault_oidc_scopes": _env("VAULT_OIDC_SCOPES", "openid profile email groups"),
"vault_oidc_user_claim": _env("VAULT_OIDC_USER_CLAIM", "preferred_username"),
"vault_oidc_groups_claim": _env("VAULT_OIDC_GROUPS_CLAIM", "groups"),
"vault_oidc_token_policies": _env("VAULT_OIDC_TOKEN_POLICIES", ""),
"vault_oidc_admin_group": _env("VAULT_OIDC_ADMIN_GROUP", "admin"),
"vault_oidc_admin_policies": _env("VAULT_OIDC_ADMIN_POLICIES", "default,vault-admin"),
"vault_oidc_dev_group": _env("VAULT_OIDC_DEV_GROUP", "dev"),
"vault_oidc_dev_policies": _env("VAULT_OIDC_DEV_POLICIES", "default,dev-kv"),
"vault_oidc_user_group": _env("VAULT_OIDC_USER_GROUP", ""),
"vault_oidc_user_policies": _env("VAULT_OIDC_USER_POLICIES", ""),
"vault_oidc_redirect_uris": _env(
"VAULT_OIDC_REDIRECT_URIS",
"https://secret.bstein.dev/ui/vault/auth/oidc/oidc/callback",
),
"vault_oidc_bound_audiences": _env("VAULT_OIDC_BOUND_AUDIENCES", ""),
"vault_oidc_bound_claims_type": _env("VAULT_OIDC_BOUND_CLAIMS_TYPE", "string"),
}
@classmethod
def _comms_config(cls) -> dict[str, Any]:
return {
"comms_namespace": _env("COMMS_NAMESPACE", "comms"),
"comms_synapse_base": _env(
"COMMS_SYNAPSE_BASE",
"http://othrys-synapse-matrix-synapse:8008",
).rstrip("/"),
"comms_auth_base": _env(
"COMMS_AUTH_BASE",
"http://matrix-authentication-service:8080",
).rstrip("/"),
"comms_mas_admin_api_base": _env(
"COMMS_MAS_ADMIN_API_BASE",
"http://matrix-authentication-service:8081/api/admin/v1",
).rstrip("/"),
"comms_mas_token_url": _env(
"COMMS_MAS_TOKEN_URL",
"http://matrix-authentication-service:8080/oauth2/token",
),
"comms_mas_admin_client_id": _env("COMMS_MAS_ADMIN_CLIENT_ID", "01KDXMVQBQ5JNY6SEJPZW6Z8BM"),
"comms_mas_admin_client_secret": _env("COMMS_MAS_ADMIN_CLIENT_SECRET", ""),
"comms_server_name": _env("COMMS_SERVER_NAME", "live.bstein.dev"),
"comms_room_alias": _env("COMMS_ROOM_ALIAS", "#othrys:live.bstein.dev"),
"comms_room_name": _env("COMMS_ROOM_NAME", "Othrys"),
"comms_pin_message": _env(
"COMMS_PIN_MESSAGE",
"Invite guests: share https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join and choose 'Continue' -> 'Join as guest'.",
),
"comms_seeder_user": _env("COMMS_SEEDER_USER", "othrys-seeder"),
"comms_seeder_password": _env("COMMS_SEEDER_PASSWORD", ""),
"comms_bot_user": _env("COMMS_BOT_USER", "atlasbot"),
"comms_bot_password": _env("COMMS_BOT_PASSWORD", ""),
"comms_synapse_db_host": _env(
"COMMS_SYNAPSE_DB_HOST",
"postgres-service.postgres.svc.cluster.local",
),
"comms_synapse_db_port": _env_int("COMMS_SYNAPSE_DB_PORT", 5432),
"comms_synapse_db_name": _env("COMMS_SYNAPSE_DB_NAME", "synapse"),
"comms_synapse_db_user": _env("COMMS_SYNAPSE_DB_USER", "synapse"),
"comms_synapse_db_password": _env("COMMS_SYNAPSE_DB_PASSWORD", ""),
"comms_synapse_admin_token": _env("COMMS_SYNAPSE_ADMIN_TOKEN", ""),
"comms_timeout_sec": _env_float("COMMS_TIMEOUT_SEC", 30.0),
"comms_guest_stale_days": _env_int("COMMS_GUEST_STALE_DAYS", 14),
}
@classmethod
def _image_sweeper_config(cls) -> dict[str, Any]:
return {
"image_sweeper_namespace": _env("IMAGE_SWEEPER_NAMESPACE", "maintenance"),
"image_sweeper_service_account": _env("IMAGE_SWEEPER_SERVICE_ACCOUNT", "node-image-sweeper"),
"image_sweeper_job_ttl_sec": _env_int("IMAGE_SWEEPER_JOB_TTL_SEC", 3600),
"image_sweeper_wait_timeout_sec": _env_float("IMAGE_SWEEPER_WAIT_TIMEOUT_SEC", 1200.0),
}
@classmethod
def _platform_quality_probe_config(cls) -> dict[str, Any]:
return {
"platform_quality_probe_namespace": _env("PLATFORM_QUALITY_PROBE_NAMESPACE", "monitoring"),
"platform_quality_probe_script_configmap": _env(
"PLATFORM_QUALITY_PROBE_SCRIPT_CONFIGMAP",
"platform-quality-suite-probe-script",
),
"platform_quality_probe_image": _env("PLATFORM_QUALITY_PROBE_IMAGE", "curlimages/curl:8.12.1"),
"platform_quality_probe_job_ttl_sec": _env_int("PLATFORM_QUALITY_PROBE_JOB_TTL_SEC", 1800),
"platform_quality_probe_wait_timeout_sec": _env_float("PLATFORM_QUALITY_PROBE_WAIT_TIMEOUT_SEC", 180.0),
"platform_quality_probe_pushgateway_url": _env(
"PLATFORM_QUALITY_PROBE_PUSHGATEWAY_URL",
"http://platform-quality-gateway.monitoring.svc.cluster.local:9091",
).rstrip("/"),
"platform_quality_probe_http_timeout_sec": _env_int("PLATFORM_QUALITY_PROBE_HTTP_TIMEOUT_SECONDS", 12),
}
@classmethod
def _vaultwarden_config(cls) -> dict[str, Any]:
return {
"vaultwarden_namespace": _env("VAULTWARDEN_NAMESPACE", "vaultwarden"),
"vaultwarden_pod_label": _env("VAULTWARDEN_POD_LABEL", "app=vaultwarden"),
"vaultwarden_pod_port": _env_int("VAULTWARDEN_POD_PORT", 80),
"vaultwarden_service_host": _env(
"VAULTWARDEN_SERVICE_HOST",
"vaultwarden-service.vaultwarden.svc.cluster.local",
),
"vaultwarden_admin_secret_name": _env("VAULTWARDEN_ADMIN_SECRET_NAME", "vaultwarden-admin"),
"vaultwarden_admin_secret_key": _env("VAULTWARDEN_ADMIN_SECRET_KEY", "ADMIN_TOKEN"),
"vaultwarden_admin_session_ttl_sec": _env_float("VAULTWARDEN_ADMIN_SESSION_TTL_SEC", 300.0),
"vaultwarden_admin_rate_limit_backoff_sec": _env_float("VAULTWARDEN_ADMIN_RATE_LIMIT_BACKOFF_SEC", 600.0),
"vaultwarden_retry_cooldown_sec": _env_float("VAULTWARDEN_RETRY_COOLDOWN_SEC", 1800.0),
"vaultwarden_failure_bailout": _env_int("VAULTWARDEN_FAILURE_BAILOUT", 2),
"vaultwarden_invite_refresh_sec": _env_float("VAULTWARDEN_INVITE_REFRESH_SEC", 86400.0),
}
@classmethod
def _schedule_config(cls) -> dict[str, Any]:
return {
"mailu_sync_cron": _env("ARIADNE_SCHEDULE_MAILU_SYNC", "30 4 * * *"),
"nextcloud_sync_cron": _env("ARIADNE_SCHEDULE_NEXTCLOUD_SYNC", "0 5 * * *"),
"nextcloud_cron": _env("ARIADNE_SCHEDULE_NEXTCLOUD_CRON", "*/5 * * * *"),
"nextcloud_maintenance_cron": _env("ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE", "30 4 * * *"),
"vaultwarden_sync_cron": _env("ARIADNE_SCHEDULE_VAULTWARDEN_SYNC", "0 * * * *"),
"wger_user_sync_cron": _env("ARIADNE_SCHEDULE_WGER_USER_SYNC", "0 5 * * *"),
"wger_admin_cron": _env("ARIADNE_SCHEDULE_WGER_ADMIN", "15 3 * * *"),
"firefly_user_sync_cron": _env("ARIADNE_SCHEDULE_FIREFLY_USER_SYNC", "0 6 * * *"),
"firefly_cron": _env("ARIADNE_SCHEDULE_FIREFLY_CRON", "0 3 * * *"),
"pod_cleaner_cron": _env("ARIADNE_SCHEDULE_POD_CLEANER", "0 * * * *"),
"opensearch_prune_cron": _env("ARIADNE_SCHEDULE_OPENSEARCH_PRUNE", "23 3 * * *"),
"image_sweeper_cron": _env("ARIADNE_SCHEDULE_IMAGE_SWEEPER", "30 4 * * 0"),
"vault_k8s_auth_cron": _env("ARIADNE_SCHEDULE_VAULT_K8S_AUTH", "0 * * * *"),
"vault_oidc_cron": _env("ARIADNE_SCHEDULE_VAULT_OIDC", "0 * * * *"),
"comms_guest_name_cron": _env("ARIADNE_SCHEDULE_COMMS_GUEST_NAME", "*/5 * * * *"),
"comms_pin_invite_cron": _env("ARIADNE_SCHEDULE_COMMS_PIN_INVITE", "*/30 * * * *"),
"comms_reset_room_cron": _env("ARIADNE_SCHEDULE_COMMS_RESET_ROOM", "0 0 1 1 *"),
"comms_seed_room_cron": _env("ARIADNE_SCHEDULE_COMMS_SEED_ROOM", "*/10 * * * *"),
"keycloak_profile_cron": _env("ARIADNE_SCHEDULE_KEYCLOAK_PROFILE", "0 */6 * * *"),
"metis_k3s_token_sync_cron": _env("ARIADNE_SCHEDULE_METIS_K3S_TOKEN_SYNC", "11 */6 * * *"),
"platform_quality_suite_probe_cron": _env(
"ARIADNE_SCHEDULE_PLATFORM_QUALITY_SUITE_PROBE",
"*/15 * * * *",
),
}
@classmethod
def _cluster_state_config(cls) -> dict[str, Any]:
return {
"vm_url": _env(
"ARIADNE_VM_URL",
"http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428",
).rstrip("/"),
"cluster_state_vm_timeout_sec": _env_float("ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC", 5.0),
"alertmanager_url": _env("ARIADNE_ALERTMANAGER_URL", "").rstrip("/"),
"cluster_state_cron": _env("ARIADNE_SCHEDULE_CLUSTER_STATE", "*/15 * * * *"),
"cluster_state_keep": _env_int("ARIADNE_CLUSTER_STATE_KEEP", 168),
}
@classmethod
def _metis_config(cls) -> dict[str, Any]:
return {
"metis_base_url": _env("METIS_BASE_URL", "http://metis.maintenance.svc.cluster.local").rstrip("/"),
"metis_watch_url": _env("METIS_WATCH_URL", "").rstrip("/"),
"metis_timeout_sec": _env_float("METIS_TIMEOUT_SEC", 10.0),
"metis_sentinel_watch_cron": _env("ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH", "*/15 * * * *"),
"metis_token_sync_namespace": _env("METIS_TOKEN_SYNC_NAMESPACE", "maintenance"),
"metis_token_sync_service_account": _env("METIS_TOKEN_SYNC_SERVICE_ACCOUNT", "metis-token-sync"),
"metis_token_sync_node_name": _env("METIS_TOKEN_SYNC_NODE_NAME", "titan-0a"),
"metis_token_sync_image": _env("METIS_TOKEN_SYNC_IMAGE", "hashicorp/vault:1.17.6"),
"metis_token_sync_job_ttl_sec": _env_int("METIS_TOKEN_SYNC_JOB_TTL_SEC", 1800),
"metis_token_sync_wait_timeout_sec": _env_float("METIS_TOKEN_SYNC_WAIT_TIMEOUT_SEC", 180.0),
"metis_token_sync_vault_addr": _env(
"METIS_TOKEN_SYNC_VAULT_ADDR",
"http://vault.vault.svc.cluster.local:8200",
).rstrip("/"),
"metis_token_sync_vault_k8s_role": _env("METIS_TOKEN_SYNC_VAULT_K8S_ROLE", "maintenance-metis-token-sync"),
}
@classmethod
def _opensearch_config(cls) -> dict[str, Any]:
return {
"opensearch_url": _env(
"OPENSEARCH_URL",
"http://opensearch-master.logging.svc.cluster.local:9200",
).rstrip("/"),
"opensearch_limit_bytes": _env_int("OPENSEARCH_LIMIT_BYTES", 1024**4),
"opensearch_index_patterns": _env("OPENSEARCH_INDEX_PATTERNS", "kube-*,journald-*"),
"opensearch_timeout_sec": _env_float("OPENSEARCH_TIMEOUT_SEC", 30.0),
}
@classmethod
def from_env(cls) -> "Settings":
keycloak_cfg = cls._keycloak_config()
portal_cfg = cls._portal_group_config()
mailu_cfg = cls._mailu_config()
smtp_cfg = cls._smtp_config(mailu_cfg["mailu_domain"])
nextcloud_cfg = cls._nextcloud_config()
wger_cfg = cls._wger_config()
firefly_cfg = cls._firefly_config()
vault_cfg = cls._vault_config()
comms_cfg = cls._comms_config()
image_cfg = cls._image_sweeper_config()
platform_quality_probe_cfg = cls._platform_quality_probe_config()
vaultwarden_cfg = cls._vaultwarden_config()
schedule_cfg = cls._schedule_config()
cluster_cfg = cls._cluster_state_config()
metis_cfg = cls._metis_config()
opensearch_cfg = cls._opensearch_config()
keycloak_cfg = _keycloak_config()
portal_cfg = _portal_group_config()
mailu_cfg = _mailu_config()
smtp_cfg = _smtp_config(mailu_cfg["mailu_domain"])
nextcloud_cfg = _nextcloud_config()
wger_cfg = _wger_config()
firefly_cfg = _firefly_config()
vault_cfg = _vault_config()
comms_cfg = _comms_config()
image_cfg = _image_sweeper_config()
platform_quality_probe_cfg = _platform_quality_probe_config()
jenkins_build_weather_cfg = _jenkins_build_weather_config()
jenkins_workspace_cleanup_cfg = _jenkins_workspace_cleanup_config()
vaultwarden_cfg = _vaultwarden_config()
schedule_cfg = _schedule_config()
cluster_cfg = _cluster_state_config()
metis_cfg = _metis_config()
opensearch_cfg = _opensearch_config()
portal_db = _env("PORTAL_DATABASE_URL", "")
ariadne_db = _env("ARIADNE_DATABASE_URL", portal_db)
@ -605,6 +303,8 @@ class Settings:
**comms_cfg,
**image_cfg,
**platform_quality_probe_cfg,
**jenkins_build_weather_cfg,
**jenkins_workspace_cleanup_cfg,
**vaultwarden_cfg,
**schedule_cfg,
**cluster_cfg,

28
ariadne/settings_env.py Normal file
View File

@ -0,0 +1,28 @@
from __future__ import annotations
import os
def _env(name: str, default: str = "") -> str:
value = os.getenv(name, default)
return value.strip() if isinstance(value, str) else default
def _env_bool(name: str, default: str = "false") -> bool:
    """Interpret an env var as a flag: true for 1/true/yes/y/on (any case)."""
    truthy = {"1", "true", "yes", "y", "on"}
    return _env(name, default).lower() in truthy
def _env_int(name: str, default: int) -> int:
    """Parse an env var as int; return *default* when unset or malformed."""
    text = _env(name, str(default))
    try:
        parsed = int(text)
    except ValueError:
        return default
    return parsed
def _env_float(name: str, default: float) -> float:
    """Parse an env var as float; return *default* when unset or malformed."""
    text = _env(name, str(default))
    try:
        parsed = float(text)
    except ValueError:
        return default
    return parsed

View File

@ -0,0 +1,343 @@
from __future__ import annotations
from typing import Any
from .settings_env import _env, _env_bool, _env_float, _env_int
def _keycloak_config() -> dict[str, Any]:
    """Resolve Keycloak endpoints and admin-client credentials from env vars.

    The issuer and JWKS URLs default to values derived from the base URL and
    realm; all URLs are normalised to have no trailing slash.
    """
    url = _env("KEYCLOAK_URL", "https://sso.bstein.dev").rstrip("/")
    realm = _env("KEYCLOAK_REALM", "atlas")
    client_id = _env("KEYCLOAK_CLIENT_ID", "bstein-dev-home")
    issuer = _env("KEYCLOAK_ISSUER", f"{url}/realms/{realm}").rstrip("/")
    jwks = _env("KEYCLOAK_JWKS_URL", f"{issuer}/protocol/openid-connect/certs").rstrip("/")
    config: dict[str, Any] = {
        "keycloak_url": url,
        "keycloak_realm": realm,
        "keycloak_client_id": client_id,
        "keycloak_issuer": issuer,
        "keycloak_jwks_url": jwks,
    }
    # Admin API may live on a different host/realm; default to the public one.
    config["keycloak_admin_url"] = _env("KEYCLOAK_ADMIN_URL", url).rstrip("/")
    config["keycloak_admin_realm"] = _env("KEYCLOAK_ADMIN_REALM", realm)
    config["keycloak_admin_client_id"] = _env("KEYCLOAK_ADMIN_CLIENT_ID", "")
    config["keycloak_admin_client_secret"] = _env("KEYCLOAK_ADMIN_CLIENT_SECRET", "")
    return config
def _portal_group_config() -> dict[str, Any]:
    """Portal admin / allowed / default group lists from comma-separated env vars."""

    def _csv(var: str, default: str) -> list[str]:
        # Split on commas, dropping blank entries (e.g. trailing commas).
        return [item for item in _env(var, default).split(",") if item.strip()]

    return {
        "portal_admin_users": _csv("PORTAL_ADMIN_USERS", "bstein"),
        "portal_admin_groups": _csv("PORTAL_ADMIN_GROUPS", "admin"),
        "account_allowed_groups": _csv("ACCOUNT_ALLOWED_GROUPS", "dev,admin"),
        "allowed_flag_groups": _csv("ALLOWED_FLAG_GROUPS", "demo,test"),
        "default_user_groups": _csv("DEFAULT_USER_GROUPS", "dev"),
    }
def _mailu_config() -> dict[str, Any]:
    """Mailu domain, sync-listener, database, and mailbox provisioning settings."""
    domain = _env("MAILU_DOMAIN", "bstein.dev")
    cfg: dict[str, Any] = {"mailu_domain": domain}
    cfg["mailu_sync_url"] = _env(
        "MAILU_SYNC_URL",
        "http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events",
    ).rstrip("/")
    cfg["mailu_event_min_interval_sec"] = _env_float("MAILU_EVENT_MIN_INTERVAL_SEC", 10.0)
    cfg["mailu_sync_wait_timeout_sec"] = _env_float("MAILU_SYNC_WAIT_TIMEOUT_SEC", 60.0)
    cfg["mailu_mailbox_wait_timeout_sec"] = _env_float("MAILU_MAILBOX_WAIT_TIMEOUT_SEC", 60.0)
    # Mailu's own Postgres connection.
    cfg["mailu_db_host"] = _env("MAILU_DB_HOST", "postgres-service.postgres.svc.cluster.local")
    cfg["mailu_db_port"] = _env_int("MAILU_DB_PORT", 5432)
    cfg["mailu_db_name"] = _env("MAILU_DB_NAME", "mailu")
    cfg["mailu_db_user"] = _env("MAILU_DB_USER", "mailu")
    cfg["mailu_db_password"] = _env("MAILU_DB_PASSWORD", "")
    # Mailbox provisioning defaults (the mail host is derived from the domain).
    cfg["mailu_host"] = _env("MAILU_HOST", f"mail.{domain}")
    cfg["mailu_default_quota"] = _env_int("MAILU_DEFAULT_QUOTA", 20000000000)
    cfg["mailu_system_users"] = [u for u in _env("MAILU_SYSTEM_USERS", "").split(",") if u.strip()]
    cfg["mailu_system_password"] = _env("MAILU_SYSTEM_PASSWORD", "")
    return cfg
def _smtp_config(mailu_domain: str) -> dict[str, Any]:
    """Outbound SMTP relay settings; *mailu_domain* supplies the default From address."""
    smtp: dict[str, Any] = {}
    smtp["smtp_host"] = _env("SMTP_HOST", "")
    smtp["smtp_port"] = _env_int("SMTP_PORT", 25)
    smtp["smtp_username"] = _env("SMTP_USERNAME", "")
    smtp["smtp_password"] = _env("SMTP_PASSWORD", "")
    smtp["smtp_starttls"] = _env_bool("SMTP_STARTTLS", "false")
    smtp["smtp_use_tls"] = _env_bool("SMTP_USE_TLS", "false")
    smtp["smtp_from"] = _env("SMTP_FROM", f"postmaster@{mailu_domain}")
    smtp["smtp_timeout_sec"] = _env_float("SMTP_TIMEOUT_SEC", 10.0)
    smtp["welcome_email_enabled"] = _env_bool("WELCOME_EMAIL_ENABLED", "true")
    return smtp
def _nextcloud_config() -> dict[str, Any]:
    """Nextcloud pod-exec targeting, database, and admin credential settings."""
    cfg: dict[str, Any] = {}
    cfg["nextcloud_namespace"] = _env("NEXTCLOUD_NAMESPACE", "nextcloud")
    cfg["nextcloud_pod_label"] = _env("NEXTCLOUD_POD_LABEL", "app=nextcloud")
    cfg["nextcloud_container"] = _env("NEXTCLOUD_CONTAINER", "nextcloud")
    cfg["nextcloud_exec_timeout_sec"] = _env_float("NEXTCLOUD_EXEC_TIMEOUT_SEC", 120.0)
    # Direct database access settings.
    cfg["nextcloud_db_host"] = _env("NEXTCLOUD_DB_HOST", "postgres-service.postgres.svc.cluster.local")
    cfg["nextcloud_db_port"] = _env_int("NEXTCLOUD_DB_PORT", 5432)
    cfg["nextcloud_db_name"] = _env("NEXTCLOUD_DB_NAME", "nextcloud")
    cfg["nextcloud_db_user"] = _env("NEXTCLOUD_DB_USER", "nextcloud")
    cfg["nextcloud_db_password"] = _env("NEXTCLOUD_DB_PASSWORD", "")
    # Public URL and admin account.
    cfg["nextcloud_url"] = _env("NEXTCLOUD_URL", "https://cloud.bstein.dev").rstrip("/")
    cfg["nextcloud_admin_user"] = _env("NEXTCLOUD_ADMIN_USER", "")
    cfg["nextcloud_admin_password"] = _env("NEXTCLOUD_ADMIN_PASSWORD", "")
    return cfg
def _wger_config() -> dict[str, Any]:
    """Wger pod targeting and bootstrap admin credentials from env vars."""
    cfg: dict[str, Any] = {}
    cfg["wger_namespace"] = _env("WGER_NAMESPACE", "health")
    cfg["wger_user_sync_wait_timeout_sec"] = _env_float("WGER_USER_SYNC_WAIT_TIMEOUT_SEC", 60.0)
    cfg["wger_pod_label"] = _env("WGER_POD_LABEL", "app=wger")
    cfg["wger_container"] = _env("WGER_CONTAINER", "wger")
    # Admin credentials default to empty, meaning "not configured".
    cfg["wger_admin_username"] = _env("WGER_ADMIN_USERNAME", "")
    cfg["wger_admin_password"] = _env("WGER_ADMIN_PASSWORD", "")
    cfg["wger_admin_email"] = _env("WGER_ADMIN_EMAIL", "")
    return cfg
def _firefly_config() -> dict[str, Any]:
    """Firefly III pod targeting and cron-endpoint settings from env vars."""
    cfg: dict[str, Any] = {}
    cfg["firefly_namespace"] = _env("FIREFLY_NAMESPACE", "finance")
    cfg["firefly_user_sync_wait_timeout_sec"] = _env_float("FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC", 90.0)
    cfg["firefly_pod_label"] = _env("FIREFLY_POD_LABEL", "app=firefly")
    cfg["firefly_container"] = _env("FIREFLY_CONTAINER", "firefly")
    cfg["firefly_cron_base_url"] = _env(
        "FIREFLY_CRON_BASE_URL",
        "http://firefly.finance.svc.cluster.local/api/v1/cron",
    )
    cfg["firefly_cron_token"] = _env("FIREFLY_CRON_TOKEN", "")
    cfg["firefly_cron_timeout_sec"] = _env_float("FIREFLY_CRON_TIMEOUT_SEC", 30.0)
    return cfg
def _vault_config() -> dict[str, Any]:
    """Assemble Vault connection, Kubernetes auth, and OIDC settings from the environment."""
    return dict(
        vault_namespace=_env("VAULT_NAMESPACE", "vault"),
        # Trailing slash is stripped so URL joins don't double up.
        vault_addr=_env("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/"),
        vault_token=_env("VAULT_TOKEN", ""),
        vault_k8s_role=_env("VAULT_K8S_ROLE", "vault"),
        vault_k8s_role_ttl=_env("VAULT_K8S_ROLE_TTL", "1h"),
        vault_k8s_token_reviewer_jwt=_env("VAULT_K8S_TOKEN_REVIEWER_JWT", ""),
        vault_k8s_token_reviewer_jwt_file=_env("VAULT_K8S_TOKEN_REVIEWER_JWT_FILE", ""),
        vault_oidc_discovery_url=_env("VAULT_OIDC_DISCOVERY_URL", ""),
        vault_oidc_client_id=_env("VAULT_OIDC_CLIENT_ID", ""),
        vault_oidc_client_secret=_env("VAULT_OIDC_CLIENT_SECRET", ""),
        vault_oidc_default_role=_env("VAULT_OIDC_DEFAULT_ROLE", "admin"),
        vault_oidc_scopes=_env("VAULT_OIDC_SCOPES", "openid profile email groups"),
        vault_oidc_user_claim=_env("VAULT_OIDC_USER_CLAIM", "preferred_username"),
        vault_oidc_groups_claim=_env("VAULT_OIDC_GROUPS_CLAIM", "groups"),
        vault_oidc_token_policies=_env("VAULT_OIDC_TOKEN_POLICIES", ""),
        vault_oidc_admin_group=_env("VAULT_OIDC_ADMIN_GROUP", "admin"),
        vault_oidc_admin_policies=_env("VAULT_OIDC_ADMIN_POLICIES", "default,vault-admin"),
        vault_oidc_dev_group=_env("VAULT_OIDC_DEV_GROUP", "dev"),
        vault_oidc_dev_policies=_env("VAULT_OIDC_DEV_POLICIES", "default,dev-kv"),
        vault_oidc_user_group=_env("VAULT_OIDC_USER_GROUP", ""),
        vault_oidc_user_policies=_env("VAULT_OIDC_USER_POLICIES", ""),
        vault_oidc_redirect_uris=_env(
            "VAULT_OIDC_REDIRECT_URIS",
            "https://secret.bstein.dev/ui/vault/auth/oidc/oidc/callback",
        ),
        vault_oidc_bound_audiences=_env("VAULT_OIDC_BOUND_AUDIENCES", ""),
        vault_oidc_bound_claims_type=_env("VAULT_OIDC_BOUND_CLAIMS_TYPE", "string"),
    )
def _comms_config() -> dict[str, Any]:
    """Assemble Matrix/Synapse comms settings from the environment."""
    return dict(
        comms_namespace=_env("COMMS_NAMESPACE", "comms"),
        # Base URLs are normalised without a trailing slash for path joining.
        comms_synapse_base=_env(
            "COMMS_SYNAPSE_BASE",
            "http://othrys-synapse-matrix-synapse:8008",
        ).rstrip("/"),
        comms_auth_base=_env(
            "COMMS_AUTH_BASE",
            "http://matrix-authentication-service:8080",
        ).rstrip("/"),
        comms_mas_admin_api_base=_env(
            "COMMS_MAS_ADMIN_API_BASE",
            "http://matrix-authentication-service:8081/api/admin/v1",
        ).rstrip("/"),
        comms_mas_token_url=_env(
            "COMMS_MAS_TOKEN_URL",
            "http://matrix-authentication-service:8080/oauth2/token",
        ),
        comms_mas_admin_client_id=_env("COMMS_MAS_ADMIN_CLIENT_ID", "01KDXMVQBQ5JNY6SEJPZW6Z8BM"),
        comms_mas_admin_client_secret=_env("COMMS_MAS_ADMIN_CLIENT_SECRET", ""),
        comms_server_name=_env("COMMS_SERVER_NAME", "live.bstein.dev"),
        comms_room_alias=_env("COMMS_ROOM_ALIAS", "#othrys:live.bstein.dev"),
        comms_room_name=_env("COMMS_ROOM_NAME", "Othrys"),
        comms_pin_message=_env(
            "COMMS_PIN_MESSAGE",
            "Invite guests: share https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join and choose 'Continue' -> 'Join as guest'.",
        ),
        comms_seeder_user=_env("COMMS_SEEDER_USER", "othrys-seeder"),
        comms_seeder_password=_env("COMMS_SEEDER_PASSWORD", ""),
        comms_bot_user=_env("COMMS_BOT_USER", "atlasbot"),
        comms_bot_password=_env("COMMS_BOT_PASSWORD", ""),
        comms_synapse_db_host=_env(
            "COMMS_SYNAPSE_DB_HOST",
            "postgres-service.postgres.svc.cluster.local",
        ),
        comms_synapse_db_port=_env_int("COMMS_SYNAPSE_DB_PORT", 5432),
        comms_synapse_db_name=_env("COMMS_SYNAPSE_DB_NAME", "synapse"),
        comms_synapse_db_user=_env("COMMS_SYNAPSE_DB_USER", "synapse"),
        comms_synapse_db_password=_env("COMMS_SYNAPSE_DB_PASSWORD", ""),
        comms_synapse_admin_token=_env("COMMS_SYNAPSE_ADMIN_TOKEN", ""),
        comms_timeout_sec=_env_float("COMMS_TIMEOUT_SEC", 30.0),
        comms_guest_stale_days=_env_int("COMMS_GUEST_STALE_DAYS", 14),
    )
def _image_sweeper_config() -> dict[str, Any]:
    """Assemble node image sweeper job settings from the environment."""
    return dict(
        image_sweeper_namespace=_env("IMAGE_SWEEPER_NAMESPACE", "maintenance"),
        image_sweeper_service_account=_env("IMAGE_SWEEPER_SERVICE_ACCOUNT", "node-image-sweeper"),
        image_sweeper_job_ttl_sec=_env_int("IMAGE_SWEEPER_JOB_TTL_SEC", 3600),
        image_sweeper_wait_timeout_sec=_env_float("IMAGE_SWEEPER_WAIT_TIMEOUT_SEC", 1200.0),
    )
def _platform_quality_probe_config() -> dict[str, Any]:
    """Assemble platform-quality probe job settings from the environment."""
    return dict(
        platform_quality_probe_namespace=_env("PLATFORM_QUALITY_PROBE_NAMESPACE", "monitoring"),
        platform_quality_probe_script_configmap=_env(
            "PLATFORM_QUALITY_PROBE_SCRIPT_CONFIGMAP",
            "platform-quality-suite-probe-script",
        ),
        platform_quality_probe_image=_env("PLATFORM_QUALITY_PROBE_IMAGE", "curlimages/curl:8.12.1"),
        platform_quality_probe_job_ttl_sec=_env_int("PLATFORM_QUALITY_PROBE_JOB_TTL_SEC", 1800),
        platform_quality_probe_wait_timeout_sec=_env_float("PLATFORM_QUALITY_PROBE_WAIT_TIMEOUT_SEC", 180.0),
        platform_quality_probe_pushgateway_url=_env(
            "PLATFORM_QUALITY_PROBE_PUSHGATEWAY_URL",
            "http://platform-quality-gateway.monitoring.svc.cluster.local:9091",
        ).rstrip("/"),
        # NOTE(review): env var uses a `_SECONDS` suffix while siblings use
        # `_SEC`; kept as-is since deployments may already set this name.
        platform_quality_probe_http_timeout_sec=_env_int("PLATFORM_QUALITY_PROBE_HTTP_TIMEOUT_SECONDS", 12),
    )
def _jenkins_build_weather_config() -> dict[str, Any]:
    """Assemble Jenkins API settings for the build-weather task."""
    return dict(
        jenkins_base_url=_env("JENKINS_BASE_URL", "https://ci.bstein.dev").rstrip("/"),
        jenkins_api_user=_env("JENKINS_API_USER", ""),
        jenkins_api_token=_env("JENKINS_API_TOKEN", ""),
        jenkins_api_timeout_sec=_env_float("JENKINS_API_TIMEOUT_SEC", 10.0),
    )
def _jenkins_workspace_cleanup_config() -> dict[str, Any]:
    """Assemble Jenkins workspace PVC cleanup settings from the environment."""
    return dict(
        jenkins_workspace_namespace=_env("JENKINS_WORKSPACE_NAMESPACE", "jenkins"),
        jenkins_workspace_pvc_prefix=_env("JENKINS_WORKSPACE_PVC_PREFIX", "pvc-workspace-"),
        jenkins_workspace_cleanup_min_age_hours=_env_float("JENKINS_WORKSPACE_CLEANUP_MIN_AGE_HOURS", 12.0),
        jenkins_workspace_cleanup_dry_run=_env_bool("JENKINS_WORKSPACE_CLEANUP_DRY_RUN", "false"),
        jenkins_workspace_cleanup_max_deletions_per_run=_env_int(
            "JENKINS_WORKSPACE_CLEANUP_MAX_DELETIONS_PER_RUN",
            20,
        ),
    )
def _vaultwarden_config() -> dict[str, Any]:
    """Assemble Vaultwarden admin and sync settings from the environment."""
    return dict(
        vaultwarden_namespace=_env("VAULTWARDEN_NAMESPACE", "vaultwarden"),
        vaultwarden_pod_label=_env("VAULTWARDEN_POD_LABEL", "app=vaultwarden"),
        vaultwarden_pod_port=_env_int("VAULTWARDEN_POD_PORT", 80),
        vaultwarden_service_host=_env(
            "VAULTWARDEN_SERVICE_HOST",
            "vaultwarden-service.vaultwarden.svc.cluster.local",
        ),
        vaultwarden_admin_secret_name=_env("VAULTWARDEN_ADMIN_SECRET_NAME", "vaultwarden-admin"),
        vaultwarden_admin_secret_key=_env("VAULTWARDEN_ADMIN_SECRET_KEY", "ADMIN_TOKEN"),
        vaultwarden_admin_session_ttl_sec=_env_float("VAULTWARDEN_ADMIN_SESSION_TTL_SEC", 300.0),
        vaultwarden_admin_rate_limit_backoff_sec=_env_float("VAULTWARDEN_ADMIN_RATE_LIMIT_BACKOFF_SEC", 600.0),
        vaultwarden_retry_cooldown_sec=_env_float("VAULTWARDEN_RETRY_COOLDOWN_SEC", 1800.0),
        vaultwarden_failure_bailout=_env_int("VAULTWARDEN_FAILURE_BAILOUT", 2),
        vaultwarden_invite_refresh_sec=_env_float("VAULTWARDEN_INVITE_REFRESH_SEC", 86400.0),
    )
def _schedule_config() -> dict[str, Any]:
    """Assemble every scheduled task's cron expression from the environment."""
    return dict(
        mailu_sync_cron=_env("ARIADNE_SCHEDULE_MAILU_SYNC", "30 4 * * *"),
        nextcloud_sync_cron=_env("ARIADNE_SCHEDULE_NEXTCLOUD_SYNC", "0 5 * * *"),
        nextcloud_cron=_env("ARIADNE_SCHEDULE_NEXTCLOUD_CRON", "*/5 * * * *"),
        nextcloud_maintenance_cron=_env("ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE", "30 4 * * *"),
        vaultwarden_sync_cron=_env("ARIADNE_SCHEDULE_VAULTWARDEN_SYNC", "0 * * * *"),
        wger_user_sync_cron=_env("ARIADNE_SCHEDULE_WGER_USER_SYNC", "0 5 * * *"),
        wger_admin_cron=_env("ARIADNE_SCHEDULE_WGER_ADMIN", "15 3 * * *"),
        firefly_user_sync_cron=_env("ARIADNE_SCHEDULE_FIREFLY_USER_SYNC", "0 6 * * *"),
        firefly_cron=_env("ARIADNE_SCHEDULE_FIREFLY_CRON", "0 3 * * *"),
        pod_cleaner_cron=_env("ARIADNE_SCHEDULE_POD_CLEANER", "0 * * * *"),
        opensearch_prune_cron=_env("ARIADNE_SCHEDULE_OPENSEARCH_PRUNE", "23 3 * * *"),
        image_sweeper_cron=_env("ARIADNE_SCHEDULE_IMAGE_SWEEPER", "30 4 * * 0"),
        vault_k8s_auth_cron=_env("ARIADNE_SCHEDULE_VAULT_K8S_AUTH", "0 * * * *"),
        vault_oidc_cron=_env("ARIADNE_SCHEDULE_VAULT_OIDC", "0 * * * *"),
        comms_guest_name_cron=_env("ARIADNE_SCHEDULE_COMMS_GUEST_NAME", "*/5 * * * *"),
        comms_pin_invite_cron=_env("ARIADNE_SCHEDULE_COMMS_PIN_INVITE", "*/30 * * * *"),
        comms_reset_room_cron=_env("ARIADNE_SCHEDULE_COMMS_RESET_ROOM", "0 0 1 1 *"),
        comms_seed_room_cron=_env("ARIADNE_SCHEDULE_COMMS_SEED_ROOM", "*/10 * * * *"),
        keycloak_profile_cron=_env("ARIADNE_SCHEDULE_KEYCLOAK_PROFILE", "0 */6 * * *"),
        metis_k3s_token_sync_cron=_env("ARIADNE_SCHEDULE_METIS_K3S_TOKEN_SYNC", "11 */6 * * *"),
        platform_quality_suite_probe_cron=_env(
            "ARIADNE_SCHEDULE_PLATFORM_QUALITY_SUITE_PROBE",
            "*/15 * * * *",
        ),
        jenkins_build_weather_cron=_env(
            "ARIADNE_SCHEDULE_JENKINS_BUILD_WEATHER",
            "*/10 * * * *",
        ),
        jenkins_workspace_cleanup_cron=_env(
            "ARIADNE_SCHEDULE_JENKINS_WORKSPACE_CLEANUP",
            "45 */6 * * *",
        ),
    )
def _cluster_state_config() -> dict[str, Any]:
    """Assemble cluster-state snapshot settings from the environment."""
    return dict(
        vm_url=_env(
            "ARIADNE_VM_URL",
            "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428",
        ).rstrip("/"),
        cluster_state_vm_timeout_sec=_env_float("ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC", 5.0),
        alertmanager_url=_env("ARIADNE_ALERTMANAGER_URL", "").rstrip("/"),
        cluster_state_cron=_env("ARIADNE_SCHEDULE_CLUSTER_STATE", "*/15 * * * *"),
        cluster_state_keep=_env_int("ARIADNE_CLUSTER_STATE_KEEP", 168),
    )
def _metis_config() -> dict[str, Any]:
    """Assemble Metis sentinel and k3s token-sync settings from the environment."""
    return dict(
        metis_base_url=_env("METIS_BASE_URL", "http://metis.maintenance.svc.cluster.local").rstrip("/"),
        metis_watch_url=_env("METIS_WATCH_URL", "").rstrip("/"),
        metis_timeout_sec=_env_float("METIS_TIMEOUT_SEC", 10.0),
        metis_sentinel_watch_cron=_env("ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH", "*/15 * * * *"),
        metis_token_sync_namespace=_env("METIS_TOKEN_SYNC_NAMESPACE", "maintenance"),
        metis_token_sync_service_account=_env("METIS_TOKEN_SYNC_SERVICE_ACCOUNT", "metis-token-sync"),
        metis_token_sync_node_name=_env("METIS_TOKEN_SYNC_NODE_NAME", "titan-0a"),
        metis_token_sync_image=_env("METIS_TOKEN_SYNC_IMAGE", "hashicorp/vault:1.17.6"),
        metis_token_sync_job_ttl_sec=_env_int("METIS_TOKEN_SYNC_JOB_TTL_SEC", 1800),
        metis_token_sync_wait_timeout_sec=_env_float("METIS_TOKEN_SYNC_WAIT_TIMEOUT_SEC", 180.0),
        metis_token_sync_vault_addr=_env(
            "METIS_TOKEN_SYNC_VAULT_ADDR",
            "http://vault.vault.svc.cluster.local:8200",
        ).rstrip("/"),
        metis_token_sync_vault_k8s_role=_env("METIS_TOKEN_SYNC_VAULT_K8S_ROLE", "maintenance-metis-token-sync"),
    )
def _opensearch_config() -> dict[str, Any]:
    """Assemble OpenSearch pruning settings from the environment."""
    return dict(
        opensearch_url=_env(
            "OPENSEARCH_URL",
            "http://opensearch-master.logging.svc.cluster.local:9200",
        ).rstrip("/"),
        # Default retention budget is 1 TiB (1024**4 bytes).
        opensearch_limit_bytes=_env_int("OPENSEARCH_LIMIT_BYTES", 1024**4),
        opensearch_index_patterns=_env("OPENSEARCH_INDEX_PATTERNS", "kube-*,journald-*"),
        opensearch_timeout_sec=_env_float("OPENSEARCH_TIMEOUT_SEC", 30.0),
    )

View File

@ -39,6 +39,8 @@ def _http_error_detail(exc: httpx.HTTPStatusError) -> str:
def safe_error_detail(exc: Exception, fallback: str) -> str:
"""Return a user-safe error message without leaking noisy exception internals."""
runtime_detail = _runtime_error_detail(exc)
if runtime_detail:
return runtime_detail

View File

@ -7,6 +7,8 @@ _BEARER_PARTS = 2
def extract_bearer_token(request: Request) -> str | None:
"""Extract a Bearer token from a FastAPI request if one is present."""
header = request.headers.get("Authorization", "")
if not header:
return None

View File

@ -42,6 +42,8 @@ class LogConfig:
class JsonFormatter(logging.Formatter):
"""Format log records as structured JSON with Ariadne task context."""
def format(self, record: logging.LogRecord) -> str:
payload: dict[str, Any] = {
"timestamp": datetime.fromtimestamp(record.created, tz=timezone.utc).isoformat(),
@ -87,6 +89,8 @@ class _ContextFilter(logging.Filter):
def configure_logging(config: LogConfig | None = None) -> None:
"""Configure process-wide JSON logging once for Ariadne services."""
global _LOGGING_CONFIGURED
if _LOGGING_CONFIGURED:
return
@ -109,11 +113,15 @@ def configure_logging(config: LogConfig | None = None) -> None:
def get_logger(name: str) -> logging.Logger:
"""Return a named logger using the shared Ariadne logging configuration."""
return logging.getLogger(name)
@contextmanager
def task_context(name: str | None) -> Any:
"""Attach a task name to log records emitted inside the context."""
token = _TASK_NAME.set(name)
try:
yield

View File

@ -5,5 +5,7 @@ import string
def random_password(length: int = 32) -> str:
    """Return a cryptographically random password of *length* ASCII letters and digits."""
    pool = string.ascii_letters + string.digits
    chars = [secrets.choice(pool) for _ in range(length)]
    return "".join(chars)

View File

@ -0,0 +1 @@
# path reason
1 # path reason

View File

@ -2,7 +2,7 @@ fastapi==0.115.11
uvicorn[standard]==0.30.6
httpx==0.27.2
kubernetes==30.1.0
PyJWT[crypto]==2.10.1
PyJWT[crypto]==2.12.1
psycopg[binary]==3.2.6
psycopg-pool==3.2.6
croniter==2.0.7

View File

@ -0,0 +1,66 @@
#!/usr/bin/env python3
"""Enforce Ariadne's per-file source coverage contract."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
def _source_files(root: Path) -> list[str]:
    """Return sorted POSIX-style paths of every .py file under root, skipping __pycache__."""
    return [
        path.as_posix()
        for path in sorted(root.rglob("*.py"))
        if "__pycache__" not in path.parts
    ]
def _coverage_percent(file_payload: object) -> float | None:
if not isinstance(file_payload, dict):
return None
summary = file_payload.get("summary")
if not isinstance(summary, dict):
return None
value = summary.get("percent_covered")
if isinstance(value, (int, float)):
return float(value)
return None
def main() -> int:
    """Enforce the per-file coverage contract.

    Reads a coverage.py JSON report, checks every production source file is
    present and at or above the threshold, prints any failures, and returns
    a process exit code (0 pass, 1 fail).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("coverage_json")
    parser.add_argument("--source-root", default="ariadne")
    parser.add_argument("--threshold", type=float, default=95.0)
    args = parser.parse_args()
    coverage_path = Path(args.coverage_json)
    source_root = Path(args.source_root)
    payload = json.loads(coverage_path.read_text(encoding="utf-8"))
    files = payload.get("files") if isinstance(payload, dict) else None
    if not isinstance(files, dict):
        print(f"{coverage_path}: missing files coverage map")
        return 1
    # Walk the source tree once; the pass message reuses the same listing
    # instead of re-scanning the filesystem (the original called
    # _source_files twice).
    sources = _source_files(source_root)
    failures: list[str] = []
    for source_file in sources:
        percent = _coverage_percent(files.get(source_file))
        if percent is None:
            failures.append(f"{source_file}: missing from coverage report")
        elif percent < args.threshold:
            failures.append(f"{source_file}: {percent:.2f}% below {args.threshold:.2f}%")
    if failures:
        print("coverage contract failed:")
        for failure in failures:
            print(f" - {failure}")
        return 1
    print(f"coverage contract passed: {len(sources)} files >= {args.threshold:.2f}%")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""Require docstrings on public production APIs."""
from __future__ import annotations
import argparse
import ast
from pathlib import Path
def _is_dataclass_class(node: ast.ClassDef) -> bool:
    """True when the class carries a bare or called @dataclass decorator."""
    for dec in node.decorator_list:
        if isinstance(dec, ast.Name) and dec.id == "dataclass":
            return True
        if isinstance(dec, ast.Call) and isinstance(dec.func, ast.Name) and dec.func.id == "dataclass":
            return True
    return False
def _base_names(node: ast.ClassDef) -> set[str]:
    """Collect the plain-identifier base class names of a class definition."""
    names: set[str] = set()
    for base in node.bases:
        # Attribute bases (e.g. pkg.Base) are intentionally ignored.
        if isinstance(base, ast.Name):
            names.add(base.id)
    return names
def _needs_function_docstring(node: ast.FunctionDef | ast.AsyncFunctionDef, parent_class: str | None) -> bool:
"""Return whether a public function-like node needs a docstring."""
if node.name.startswith("_") and node.name != "__init__":
return False
return not (parent_class and node.name.startswith("_"))
def _needs_class_docstring(node: ast.ClassDef) -> bool:
    """Return whether a public class-like node needs a docstring.

    Private classes, dataclasses, and thin exception/BaseModel subclasses
    are exempt.
    """
    if node.name.startswith("_") or _is_dataclass_class(node):
        return False
    exempt_bases = {"Exception", "RuntimeError", "BaseException", "BaseModel"}
    return not (_base_names(node) & exempt_bases)
def _needs_docstring(node: ast.AST, *, parent_class: str | None = None) -> bool:
    """Dispatch the docstring requirement check by node kind."""
    if isinstance(node, ast.ClassDef):
        return _needs_class_docstring(node)
    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
        return _needs_function_docstring(node, parent_class)
    # Anything else (assignments, expressions, imports) never needs one.
    return False
def _iter_nodes(tree: ast.AST) -> list[tuple[ast.AST, str | None]]:
"""Yield top-level surface area nodes for contract checking."""
return [(node, None) for node in getattr(tree, "body", [])]
def main() -> int:
    """Exit non-zero when any public module-level API lacks a docstring."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default="ariadne")
    args = parser.parse_args()
    violations: list[str] = []
    for path in sorted(Path(args.root).rglob("*.py")):
        if "__pycache__" in path.parts or ".venv" in path.parts:
            continue
        module = ast.parse(path.read_text(encoding="utf-8"))
        for node, parent_class in _iter_nodes(module):
            if not _needs_docstring(node, parent_class=parent_class):
                continue
            if ast.get_docstring(node):
                continue
            if isinstance(node, ast.ClassDef):
                violations.append(f"{path}: class {node.name} is missing a docstring")
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                owner = f"{parent_class}." if parent_class else ""
                violations.append(f"{path}: {owner}{node.name} is missing a docstring")
    for item in violations:
        print(item)
    return 1 if violations else 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -1,10 +1,5 @@
#!/usr/bin/env python3
"""Enforce a ratcheted source file line-budget contract.
The check fails when:
- a file exceeds the configured line budget and is not allowlisted; or
- an allowlist entry is stale (file removed or now within budget).
"""
"""Fail when source files exceed a configured line-count threshold."""
from __future__ import annotations
@ -12,73 +7,77 @@ import argparse
from pathlib import Path
def _iter_source_files(roots: list[str], exts: set[str]) -> list[Path]:
files: list[Path] = []
for root_text in roots:
root = Path(root_text)
if not root.exists():
continue
for path in root.rglob("*"):
if not path.is_file():
continue
if path.suffix not in exts:
continue
if "__pycache__" in path.parts or ".venv" in path.parts:
continue
files.append(path.resolve())
return sorted(files)
DEFAULT_SKIP_PARTS = {
".git",
".venv",
"venv",
"build",
"dist",
"node_modules",
"__pycache__",
".pytest_cache",
}
SOURCE_SUFFIXES = {".py", ".sh", ".json", ".yaml", ".yml"}
def _load_waivers(path: Path) -> dict[str, str]:
waivers: dict[str, str] = {}
def _read_waivers(path: Path) -> set[str]:
if not path.exists():
return waivers
for raw_line in path.read_text(encoding="utf-8").splitlines():
line = raw_line.strip()
if not line or line.startswith("#"):
return set()
waived: set[str] = set()
for line in path.read_text(encoding="utf-8").splitlines():
row = line.strip()
if not row or row.startswith("#"):
continue
parts = line.split("\t")
rel_path = parts[0].strip()
reason = parts[1].strip() if len(parts) > 1 else ""
if rel_path:
waivers[rel_path] = reason
return waivers
waived.add(row.split("\t", 1)[0].strip())
return waived
def _iter_files(root: Path) -> list[Path]:
if not root.exists():
return []
files: list[Path] = []
for path in root.rglob("*"):
if not path.is_file():
continue
if any(part in DEFAULT_SKIP_PARTS for part in path.parts):
continue
if path.suffix.lower() not in SOURCE_SUFFIXES and path.name != "Jenkinsfile":
continue
files.append(path)
return files
def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument("--roots", nargs="+", default=["ariadne", "scripts", "tests"])
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--roots", nargs="+", required=True)
parser.add_argument("--max-lines", type=int, default=500)
parser.add_argument("--waivers", default="scripts/loc_hygiene_waivers.tsv")
parser.add_argument("--waivers", default="ci/loc_hygiene_waivers.tsv")
args = parser.parse_args()
repo_root = Path.cwd().resolve()
waivers = _load_waivers(repo_root / args.waivers)
source_files = _iter_source_files(args.roots, {".py", ".sh"})
waived = _read_waivers(repo_root / args.waivers)
violations: dict[str, int] = {}
for path in source_files:
rel = path.relative_to(repo_root).as_posix()
lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
if lines > args.max_lines:
violations[rel] = lines
offenders: list[tuple[int, str]] = []
for root_name in args.roots:
for path in _iter_files(repo_root / root_name):
rel = path.relative_to(repo_root).as_posix()
if rel in waived:
continue
try:
line_count = sum(1 for _ in path.open("r", encoding="utf-8", errors="ignore"))
except OSError:
continue
if line_count > args.max_lines:
offenders.append((line_count, rel))
unexpected = sorted(rel for rel in violations if rel not in waivers)
stale = sorted(rel for rel in waivers if rel not in violations)
if not unexpected and not stale:
print(
f"[hygiene] source line budget check passed (limit={args.max_lines}, over_limit={len(violations)}, waivers={len(waivers)})"
)
if not offenders:
print(f"[loc] ok: no files exceed {args.max_lines} lines")
return 0
if unexpected:
print("[hygiene] files over budget missing from waiver list:")
for rel in unexpected:
print(f"- {rel}: {violations[rel]} lines (limit {args.max_lines})")
if stale:
print("[hygiene] stale waiver entries (remove from waiver list):")
for rel in stale:
print(f"- {rel}")
offenders.sort(reverse=True)
print(f"[loc] failed: {len(offenders)} file(s) exceed {args.max_lines} lines")
for lines, rel in offenders:
print(f" - {rel}: {lines} lines")
return 1

View File

@ -1,14 +0,0 @@
# relative_path<TAB>why_it_is_allowlisted_for_now
ariadne/app.py core application router/orchestration pending decomposition
ariadne/manager/provisioning.py provisioning workflow hub pending modular extraction
ariadne/services/cluster_state.py legacy cluster-state monolith pending split (tracked by branch scope)
ariadne/services/comms.py legacy comms monolith pending split by concern
ariadne/services/firefly.py firefly integration handlers pending endpoint split
ariadne/services/nextcloud.py nextcloud integration surface pending staged decomposition
ariadne/services/vault.py vault integration flow pending dedicated auth/storage modules
ariadne/services/wger.py wger integration flow pending endpoint-layer split
ariadne/settings.py configuration map pending domain-specific config modules
tests/test_app.py broad integration assertions pending test-suite decomposition
tests/test_keycloak_admin.py keycloak contract tests pending helper extraction
tests/test_provisioning.py provisioning matrix tests pending split by workflow phase
tests/test_services.py service integration matrix pending split by service domain
Can't render this file because it has a wrong number of fields in line 2.

View File

@ -10,8 +10,12 @@ import sys
import urllib.request
import xml.etree.ElementTree as ET
SOURCE_SCAN_ROOTS = ("ariadne", "scripts", "tests")
HTTP_BAD_REQUEST = 400
MIN_METRIC_FIELDS = 2
SOURCE_SCAN_ROOTS = ("ariadne", "scripts", "testing")
SOURCE_EXTENSIONS = {".py", ".sh"}
QUALITY_SUCCESS_STATES = {"ok", "pass", "passed", "success", "compliant"}
COVERAGE_GATE_TARGET_PERCENT = 95.0
def _escape_label(value: str) -> str:
@ -61,6 +65,37 @@ def _load_junit(path: str) -> dict[str, int]:
return totals
def _load_junit_cases(path: str) -> list[tuple[str, str]]:
    """Read (test_id, status) pairs from a JUnit XML report.

    Accepts either a <testsuite> or a <testsuites> root; unnamed cases are
    dropped, and status precedence is failure > error > skipped > passed.
    """
    root = ET.parse(path).getroot()
    if root.tag == "testsuite":
        suites = [root]
    elif root.tag == "testsuites":
        suites = list(root.findall("testsuite"))
    else:
        suites = []
    cases: list[tuple[str, str]] = []
    for suite in suites:
        for case in suite.findall("testcase"):
            name = (case.attrib.get("name") or "").strip()
            if not name:
                continue
            classname = (case.attrib.get("classname") or "").strip()
            test_id = f"{classname}::{name}" if classname else name
            for tag, label in (("failure", "failed"), ("error", "error"), ("skipped", "skipped")):
                if case.find(tag) is not None:
                    status = label
                    break
            else:
                status = "passed"
            cases.append((test_id, status))
    return cases
def _read_http(url: str) -> str:
try:
with urllib.request.urlopen(url, timeout=10) as resp:
@ -73,11 +108,11 @@ def _post_text(url: str, payload: str) -> None:
req = urllib.request.Request(
url,
data=payload.encode("utf-8"),
method="POST",
method="PUT",
headers={"Content-Type": "text/plain"},
)
with urllib.request.urlopen(req, timeout=10) as resp:
if resp.status >= 400:
if resp.status >= HTTP_BAD_REQUEST:
raise RuntimeError(f"metrics push failed status={resp.status}")
@ -92,7 +127,7 @@ def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str,
if any(f'{k}="{v}"' not in line for k, v in labels.items()):
continue
parts = line.split()
if len(parts) < 2:
if len(parts) < MIN_METRIC_FIELDS:
continue
try:
return float(parts[1])
@ -118,31 +153,120 @@ def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int
return count
def _load_gate_rc(path: Path) -> int | None:
if not path.exists():
return None
raw = path.read_text(encoding="utf-8").strip()
if not raw:
return None
try:
return int(raw)
except ValueError:
return None
def _load_json(path: Path) -> dict | None:
if not path.exists():
return None
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception:
return None
return payload if isinstance(payload, dict) else None
def _sonarqube_check_status(build_dir: Path) -> str:
    """Map the archived SonarQube quality-gate report to ok/failed/not_applicable."""
    default_path = str(build_dir / "sonarqube-quality-gate.json")
    report = _load_json(Path(os.getenv("QUALITY_GATE_SONARQUBE_REPORT", default_path)))
    if not report:
        return "not_applicable"
    project_status = report.get("projectStatus")
    quality_gate = report.get("qualityGate")
    # Accept any of the report shapes SonarQube / the scanner may emit.
    candidates = (
        report.get("status"),
        project_status.get("status") if isinstance(project_status, dict) else None,
        quality_gate.get("status") if isinstance(quality_gate, dict) else None,
    )
    for value in candidates:
        if isinstance(value, str):
            return "ok" if value.strip().lower() in QUALITY_SUCCESS_STATES else "failed"
    return "failed"
def _supply_chain_check_status(build_dir: Path) -> str:
    """Map the Iron Bank compliance report to ok/failed/not_applicable."""
    default_path = str(build_dir / "ironbank-compliance.json")
    report = _load_json(Path(os.getenv("QUALITY_GATE_IRONBANK_REPORT", default_path)))
    if not report:
        return "not_applicable"
    compliant = report.get("compliant")
    # A boolean `compliant` field wins; otherwise fall back to string fields.
    if isinstance(compliant, bool):
        return "ok" if compliant else "failed"
    for value in (report.get("status"), report.get("result"), report.get("compliance")):
        if isinstance(value, str):
            return "ok" if value.strip().lower() in QUALITY_SUCCESS_STATES else "failed"
    return "failed"
def _resolve_artifact_paths(repo_root: Path) -> tuple[Path, Path]:
    """Locate coverage and JUnit artifacts, falling back to common alternate names."""
    coverage_path = Path(os.getenv("COVERAGE_JSON", "build/coverage.json"))
    junit_path = Path(os.getenv("JUNIT_XML", "build/junit.xml"))
    if not coverage_path.exists():
        fallbacks = (
            repo_root / "build" / "coverage.json",
            repo_root / "build" / "coverage-summary.json",
            repo_root / "build" / "coverage" / "coverage-summary.json",
        )
        coverage_path = next((c for c in fallbacks if c.exists()), coverage_path)
    if not junit_path.exists():
        junit_candidates = sorted((repo_root / "build").glob("junit*.xml"))
        if junit_candidates:
            junit_path = junit_candidates[0]
    return coverage_path, junit_path
def main() -> int:
repo_root = Path(__file__).resolve().parents[1]
coverage_path = os.getenv("COVERAGE_JSON", "build/coverage.json")
junit_path = os.getenv("JUNIT_XML", "build/junit.xml")
build_dir = repo_root / "build"
coverage_path, junit_path = _resolve_artifact_paths(repo_root)
pushgateway_url = os.getenv(
"PUSHGATEWAY_URL", "http://platform-quality-gateway.monitoring.svc.cluster.local:9091"
).strip()
suite = os.getenv("SUITE_NAME", "ariadne")
branch = os.getenv("BRANCH_NAME", "")
branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
if branch.startswith("origin/"):
branch = branch[len("origin/") :]
build_number = os.getenv("BUILD_NUMBER", "")
jenkins_job = os.getenv("JOB_NAME", "ariadne")
commit = os.getenv("GIT_COMMIT", "")
if not os.path.exists(coverage_path):
raise RuntimeError(f"missing coverage file {coverage_path}")
if not os.path.exists(junit_path):
raise RuntimeError(f"missing junit file {junit_path}")
print(f"[metrics] coverage_path={coverage_path} exists={coverage_path.exists()}")
print(f"[metrics] junit_path={junit_path} exists={junit_path.exists()}")
coverage = _load_coverage(coverage_path)
coverage = 0.0
if coverage_path.exists():
coverage = _load_coverage(str(coverage_path))
docs_gate_rc = _load_gate_rc(Path(os.getenv("QUALITY_GATE_DOCS_RC_PATH", str(build_dir / "docs-naming.rc"))))
source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
totals = _load_junit(junit_path)
totals = {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
test_cases: list[tuple[str, str]] = []
if junit_path.exists():
totals = _load_junit(str(junit_path))
test_cases = _load_junit_cases(str(junit_path))
passed = max(totals["tests"] - totals["failures"] - totals["errors"] - totals["skipped"], 0)
outcome = "ok"
if totals["tests"] <= 0 or totals["failures"] > 0 or totals["errors"] > 0:
outcome = "failed"
checks = {
"tests": "ok" if outcome == "ok" else "failed",
"coverage": "ok" if coverage >= COVERAGE_GATE_TARGET_PERCENT else "failed",
"loc": "ok" if source_lines_over_500 == 0 else "failed",
"docs_naming": "ok" if docs_gate_rc == 0 else "failed",
"gate_glue": "ok",
"sonarqube": _sonarqube_check_status(build_dir),
"supply_chain": _supply_chain_check_status(build_dir),
}
job_name = "platform-quality-ci"
ok_count = _fetch_existing_counter(
@ -164,8 +288,15 @@ def main() -> int:
"suite": suite,
"branch": branch,
"build_number": build_number,
"jenkins_job": jenkins_job,
"commit": commit,
}
test_case_base_labels = {
"suite": suite,
"branch": branch,
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job,
}
payload_lines = [
"# TYPE platform_quality_gate_runs_total counter",
f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count:.0f}',
@ -181,9 +312,26 @@ def main() -> int:
f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {coverage:.3f}',
"# TYPE platform_quality_gate_source_lines_over_500_total gauge",
f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
"# TYPE platform_quality_gate_build_info gauge",
f"platform_quality_gate_build_info{_label_str(labels)} 1",
"# TYPE ariadne_quality_gate_checks_total gauge",
"# TYPE platform_quality_gate_test_case_result gauge",
"# TYPE ariadne_quality_gate_build_info gauge",
f"ariadne_quality_gate_build_info{_label_str(labels)} 1",
]
if test_cases:
payload_lines.extend(
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': test_name, 'status': test_status})} 1"
for test_name, test_status in test_cases
)
else:
payload_lines.append(
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': '__no_test_cases__', 'status': 'skipped'})} 1"
)
payload_lines.extend(
f'ariadne_quality_gate_checks_total{{suite="{suite}",check="{check_name}",result="{check_status}"}} 1'
for check_name, check_status in checks.items()
)
payload = "\n".join(payload_lines) + "\n"
_post_text(f"{pushgateway_url.rstrip('/')}/metrics/job/{job_name}/suite/{suite}", payload)

0
tests/__init__.py Normal file
View File

File diff suppressed because it is too large Load Diff

View File

@ -20,7 +20,7 @@ def test_keycloak_verify_accepts_matching_audience(monkeypatch) -> None:
kc = KeycloakOIDC("https://jwks", "https://issuer", "portal")
monkeypatch.setattr(kc, "_get_jwks", lambda force=False: {"keys": [{"kid": "test"}]})
monkeypatch.setattr(jwt.algorithms.RSAAlgorithm, "from_jwk", lambda key: "dummy")
monkeypatch.setattr(kc, "_key_from_jwk", lambda key: "dummy")
monkeypatch.setattr(
jwt,
"decode",
@ -36,7 +36,7 @@ def test_keycloak_verify_rejects_wrong_audience(monkeypatch) -> None:
kc = KeycloakOIDC("https://jwks", "https://issuer", "portal")
monkeypatch.setattr(kc, "_get_jwks", lambda force=False: {"keys": [{"kid": "test"}]})
monkeypatch.setattr(jwt.algorithms.RSAAlgorithm, "from_jwk", lambda key: "dummy")
monkeypatch.setattr(kc, "_key_from_jwk", lambda key: "dummy")
monkeypatch.setattr(
jwt,
"decode",
@ -73,7 +73,7 @@ def test_keycloak_verify_refreshes_jwks(monkeypatch) -> None:
return {"keys": [{"kid": "test"}]}
monkeypatch.setattr(kc, "_get_jwks", fake_get_jwks)
monkeypatch.setattr(jwt.algorithms.RSAAlgorithm, "from_jwk", lambda key: "dummy")
monkeypatch.setattr(kc, "_key_from_jwk", lambda key: "dummy")
monkeypatch.setattr(
jwt,
"decode",

View File

@ -98,6 +98,25 @@ def test_migrate_ignores_timeout_errors(monkeypatch) -> None:
db.migrate(lock_id=123)
def test_migrate_stops_when_dict_lock_is_unavailable(monkeypatch) -> None:
    """When the advisory lock is denied (dict-shaped row), migrate() must stop and never unlock."""

    class DeniedLockConn(DummyConn):
        def execute(self, query, params=None):
            # Report the advisory lock as held elsewhere, via a dict row.
            if "pg_try_advisory_lock" in query:
                return DummyResult(row={"pg_try_advisory_lock": False})
            return super().execute(query, params)

    class DeniedLockPool(DummyPool):
        def __init__(self, conninfo=None, min_size=None, max_size=None, kwargs=None):
            self.conn = DeniedLockConn()

    monkeypatch.setattr(db_module, "ConnectionPool", DeniedLockPool)
    database = Database("postgresql://user:pass@localhost/db")
    database.migrate(lock_id=123)
    # No lock acquired -> no unlock statement may have been issued.
    unlock_issued = any(
        "pg_advisory_unlock" in query for query, _params in database._pool.conn.executed
    )
    assert not unlock_issued
def test_migrate_handles_lock_on_alter(monkeypatch) -> None:
class LockConn(DummyConn):
def execute(self, query, params=None):
@ -114,6 +133,46 @@ def test_migrate_handles_lock_on_alter(monkeypatch) -> None:
db.migrate(lock_id=123)
def test_migrate_skips_missing_access_request_table(monkeypatch) -> None:
    """migrate() tolerates UndefinedTable when altering access_requests and still attempts it."""

    class NoAccessRequestsConn(DummyConn):
        def execute(self, query, params=None):
            # Simulate the table not existing: record the attempt, then raise.
            if "ALTER TABLE access_requests" in query:
                self.executed.append((query, params))
                raise db_module.psycopg.errors.UndefinedTable()
            return super().execute(query, params)

    class NoAccessRequestsPool(DummyPool):
        def __init__(self, conninfo=None, min_size=None, max_size=None, kwargs=None):
            self.conn = NoAccessRequestsConn()

    monkeypatch.setattr(db_module, "ConnectionPool", NoAccessRequestsPool)
    database = Database("postgresql://user:pass@localhost/db")
    database.migrate(lock_id=123, include_ariadne_tables=False)
    alter_attempted = any(
        "ALTER TABLE access_requests" in query
        for query, _params in database._pool.conn.executed
    )
    assert alter_attempted
def test_migrate_ignores_unlock_failures(monkeypatch) -> None:
    """A failing pg_advisory_unlock must not abort migrate(); the attempt is still recorded."""

    class FlakyUnlockConn(DummyConn):
        def execute(self, query, params=None):
            # Record the unlock, then fail it, as a dropped connection would.
            if "pg_advisory_unlock" in query:
                self.executed.append((query, params))
                raise RuntimeError("unlock connection closed")
            return super().execute(query, params)

    class FlakyUnlockPool(DummyPool):
        def __init__(self, conninfo=None, min_size=None, max_size=None, kwargs=None):
            self.conn = FlakyUnlockConn()

    monkeypatch.setattr(db_module, "ConnectionPool", FlakyUnlockPool)
    database = Database("postgresql://user:pass@localhost/db")
    database.migrate(lock_id=123, include_ariadne_tables=False, include_access_requests=False)
    unlock_attempted = any(
        "pg_advisory_unlock" in query for query, _params in database._pool.conn.executed
    )
    assert unlock_attempted
def test_fetchone_and_fetchall_return_dicts(monkeypatch) -> None:
class RowConn(DummyConn):
def execute(self, query, params=None):

View File

@ -0,0 +1,325 @@
from __future__ import annotations
from datetime import datetime, timezone
import types
import httpx
import pytest
from prometheus_client import REGISTRY
from ariadne.services import jenkins_build_weather as weather_module
class _DummyResponse:
    """Minimal stand-in for an httpx response serving one canned JSON payload."""

    def __init__(self, payload: dict[str, object], status_code: int = 200) -> None:
        self.status_code = status_code
        self._payload = payload

    def raise_for_status(self) -> None:
        """Mirror httpx semantics: raise HTTPStatusError only for 4xx/5xx codes."""
        if self.status_code < 400:
            return
        request = httpx.Request("GET", "https://ci.bstein.dev/api/json")
        response = httpx.Response(self.status_code, request=request)
        raise httpx.HTTPStatusError("boom", request=request, response=response)

    def json(self) -> dict[str, object]:
        """Return the canned payload."""
        return self._payload
class _DummyClient:
    """Context-manager fake for httpx.Client that serves a single payload and records use."""

    def __init__(self, payload: dict[str, object]) -> None:
        self.called = False
        self._payload = payload

    def __enter__(self) -> _DummyClient:
        return self

    def __exit__(self, exc_type, exc, tb) -> bool:
        # Never swallow exceptions raised inside the with-block.
        return False

    def get(self, url: str, params: dict[str, str] | None = None) -> _DummyResponse:
        """Record the call and sanity-check the request the collector builds."""
        self.called = True
        expected_url = "https://ci.bstein.dev/api/json"
        assert url == expected_url
        assert isinstance(params, dict)
        assert "tree" in params
        return _DummyResponse(self._payload)
def _metric_value(name: str, labels: dict[str, str] | None = None) -> float | None:
    """Read one sample from the global registry; None when the series does not exist."""
    sample = REGISTRY.get_sample_value(name, labels or {})
    if sample is None:
        return None
    return float(sample)
def _dummy_settings(base_url: str = "https://ci.bstein.dev") -> types.SimpleNamespace:
return types.SimpleNamespace(
jenkins_base_url=base_url,
jenkins_api_user="",
jenkins_api_token="",
jenkins_api_timeout_sec=5.0,
)
def test_collect_jenkins_build_weather_records_metrics(monkeypatch) -> None:
    """Two jobs (one green, one red) populate per-job gauges and the ok-run counter."""
    # Reset the module-level series cache so labels from earlier tests cannot leak in.
    weather_module._JOB_SERIES = set()
    monkeypatch.setattr(weather_module, "settings", _dummy_settings())
    # Jenkins /api/json shape: one healthy job and one failing job.
    payload = {
        "jobs": [
            {
                "name": "ariadne",
                "url": "https://ci.bstein.dev/job/ariadne/",
                "color": "blue",
                "healthReport": [{"score": 93}],
                "lastBuild": {"result": "SUCCESS", "timestamp": 1713000000000, "duration": 186000},
                "lastSuccessfulBuild": {"timestamp": 1713000000000},
                "lastFailedBuild": {"timestamp": 1712000000000},
            },
            {
                "name": "titan-iac",
                "url": "https://ci.bstein.dev/job/titan-iac/",
                "color": "red",
                "healthReport": [{"score": 11}],
                "lastBuild": {"result": "FAILURE", "timestamp": 1712990000000, "duration": 126000},
                "lastSuccessfulBuild": {"timestamp": 1711000000000},
                "lastFailedBuild": {"timestamp": 1712990000000},
            },
        ]
    }
    monkeypatch.setattr(weather_module.httpx, "Client", lambda **_kwargs: _DummyClient(payload))
    # The registry is shared across tests, so assert on deltas, not absolutes.
    before = _metric_value("ariadne_jenkins_build_weather_runs_total", {"status": "ok"}) or 0.0
    summary = weather_module.collect_jenkins_build_weather()
    assert summary.jobs_total == 2
    assert summary.success_total == 1
    assert summary.failure_total == 1
    assert summary.running_total == 0
    assert summary.unknown_total == 0
    assert (_metric_value("ariadne_jenkins_build_weather_runs_total", {"status": "ok"}) or 0.0) == before + 1
    # Per-job status gauge: 1.0 for success, 0.0 for failure; labels carry the icon.
    assert _metric_value(
        "ariadne_jenkins_build_weather_job_last_status",
        {
            "job": "ariadne",
            "job_url": "https://ci.bstein.dev/job/ariadne/",
            "weather_icon": "☀️",
        },
    ) == 1.0
    assert _metric_value(
        "ariadne_jenkins_build_weather_job_last_status",
        {
            "job": "titan-iac",
            "job_url": "https://ci.bstein.dev/job/titan-iac/",
            "weather_icon": "⛈️",
        },
    ) == 0.0
    # Duration converts from Jenkins milliseconds (186000) to seconds.
    assert _metric_value(
        "ariadne_jenkins_build_weather_job_last_duration_seconds",
        {
            "job": "ariadne",
            "job_url": "https://ci.bstein.dev/job/ariadne/",
            "weather_icon": "☀️",
        },
    ) == 186.0
def test_collect_jenkins_build_weather_removes_deleted_job_series(monkeypatch) -> None:
    """A job present in run 1 but absent in run 2 has its metric series removed."""
    weather_module._JOB_SERIES = set()
    monkeypatch.setattr(weather_module, "settings", _dummy_settings())
    # First poll: two jobs, including "pegasus".
    first_payload = {
        "jobs": [
            {
                "name": "ariadne",
                "url": "https://ci.bstein.dev/job/ariadne/",
                "color": "blue",
                "healthReport": [{"score": 90}],
                "lastBuild": {"result": "SUCCESS", "timestamp": 1713000000000, "duration": 186000},
                "lastSuccessfulBuild": {"timestamp": 1713000000000},
                "lastFailedBuild": {"timestamp": 1712000000000},
            },
            {
                "name": "pegasus",
                "url": "https://ci.bstein.dev/job/pegasus/",
                "color": "yellow",
                "healthReport": [{"score": 50}],
                "lastBuild": {"result": "FAILURE", "timestamp": 1712980000000, "duration": 120000},
                "lastSuccessfulBuild": {"timestamp": 1710000000000},
                "lastFailedBuild": {"timestamp": 1712980000000},
            },
        ]
    }
    # Second poll: "pegasus" was deleted from Jenkins.
    second_payload = {
        "jobs": [
            {
                "name": "ariadne",
                "url": "https://ci.bstein.dev/job/ariadne/",
                "color": "blue",
                "healthReport": [{"score": 90}],
                "lastBuild": {"result": "SUCCESS", "timestamp": 1713010000000, "duration": 184000},
                "lastSuccessfulBuild": {"timestamp": 1713010000000},
                "lastFailedBuild": {"timestamp": 1712000000000},
            }
        ]
    }
    # Each fake Client construction consumes the next payload in order.
    payloads = [first_payload, second_payload]
    monkeypatch.setattr(
        weather_module.httpx,
        "Client",
        lambda **_kwargs: _DummyClient(payloads.pop(0)),
    )
    weather_module.collect_jenkins_build_weather()
    weather_module.collect_jenkins_build_weather()
    # The pegasus series must be gone from the registry after the second run.
    assert _metric_value(
        "ariadne_jenkins_build_weather_job_last_status",
        {
            "job": "pegasus",
            "job_url": "https://ci.bstein.dev/job/pegasus/",
            "weather_icon": "☁️",
        },
    ) is None
def test_collect_jenkins_build_weather_skips_when_base_url_empty(monkeypatch) -> None:
    """With no Jenkins base URL configured, collection is a no-op counted as 'skipped'."""
    runs_metric = "ariadne_jenkins_build_weather_runs_total"
    skipped_labels = {"status": "skipped"}
    weather_module._JOB_SERIES = set()
    monkeypatch.setattr(weather_module, "settings", _dummy_settings(base_url=""))
    skipped_before = _metric_value(runs_metric, skipped_labels) or 0.0
    summary = weather_module.collect_jenkins_build_weather()
    assert summary.jobs_total == 0
    skipped_after = _metric_value(runs_metric, skipped_labels) or 0.0
    assert skipped_after == skipped_before + 1
def test_fetch_jobs_flattens_folder_jobs(monkeypatch) -> None:
    """Jobs nested inside a Jenkins folder are flattened to "folder/child" entries."""
    weather_module._JOB_SERIES = set()
    monkeypatch.setattr(weather_module, "settings", _dummy_settings())
    # Folder container with one buildable child job.
    payload = {
        "jobs": [
            {
                "name": "folder",
                "url": "https://ci.bstein.dev/job/folder/",
                "jobs": [
                    {
                        "name": "child",
                        "url": "https://ci.bstein.dev/job/folder/job/child/",
                        "color": "blue",
                        "healthReport": [{"score": 100}],
                        "lastBuild": {"result": "SUCCESS", "timestamp": 1713000000000, "duration": 1000},
                        "lastSuccessfulBuild": {"timestamp": 1713000000000},
                        "lastFailedBuild": {"timestamp": 1712000000000},
                    }
                ],
            }
        ]
    }
    monkeypatch.setattr(weather_module.httpx, "Client", lambda **_kwargs: _DummyClient(payload))
    jobs = weather_module._fetch_jobs()
    # Only the child survives flattening; the folder itself is not a job.
    assert len(jobs) == 1
    assert jobs[0].job == "folder/child"
    assert jobs[0].status == "success"
    # 1000 ms Jenkins duration -> 1.0 s.
    assert jobs[0].last_duration_seconds == 1.0
    # Timestamp 1713000000000 ms falls in April 2024 UTC.
    assert datetime.fromtimestamp(jobs[0].last_run_ts, tz=timezone.utc).year == 2024
def test_weather_helper_edges(monkeypatch) -> None:
    """Spot-check the small pure helpers: number coercion, auth trimming, status/health/icon maps."""
    # Booleans and arbitrary objects coerce to 0.0 rather than raising.
    assert weather_module._metric_number(True) == 0.0
    assert weather_module._metric_number(object()) == 0.0
    assert weather_module._millis_to_seconds(0) == 0.0
    monkeypatch.setattr(
        weather_module,
        "settings",
        types.SimpleNamespace(jenkins_api_user=" user ", jenkins_api_token=" token "),
    )
    # Credentials are whitespace-trimmed before use.
    assert weather_module._jenkins_auth() == ("user", "token")
    # Jenkins "color" mapping: *_anime means a build is running.
    assert weather_module._jenkins_status({"color": "blue_anime"}) == "running"
    assert weather_module._jenkins_status({"color": "green"}) == "success"
    assert weather_module._jenkins_status({"color": "yellow"}) == "failure"
    assert weather_module._jenkins_status({}) == "unknown"
    # Malformed/missing health reports fall back to a status-derived score.
    assert weather_module._health_score({"healthReport": ["bad"]}, "success") == 100.0
    assert weather_module._health_score({}, "running") == 60.0
    assert weather_module._health_score({}, "failure") == 10.0
    assert weather_module._health_score({}, "unknown") == -1.0
    # NOTE(review): the expected icons for scores -1 and 60 appear as empty strings
    # here — confirm the glyphs were not lost in transit.
    assert weather_module._weather_icon(-1) == ""
    assert weather_module._weather_icon(60) == ""
    assert weather_module._weather_icon(20) == "🌧️"
def test_flatten_parse_and_fetch_edges(monkeypatch) -> None:
    """Edge cases for _flatten_jobs/_parse_job/_fetch_jobs: bad entries, auth kwargs, non-object payloads."""
    flattened = weather_module._flatten_jobs(
        [
            "bad",
            {"name": ""},
            {"name": "folder", "jobs": [{"name": "child", "url": "https://ci/job/child/", "lastBuild": {"result": "SUCCESS"}}]},
            {"name": "folder-without-build", "jobs": []},
        ]
    )
    # Non-dict and unnamed entries are dropped; folder children get "folder/child" names.
    assert [job["name"] for job in flattened] == ["folder/child", "folder-without-build"]
    # A job without a URL cannot be parsed into a record.
    assert weather_module._parse_job({"name": "missing-url"}) is None
    # No base URL configured -> fetching is a no-op.
    monkeypatch.setattr(weather_module, "settings", _dummy_settings(base_url=""))
    assert weather_module._fetch_jobs() == []
    captured = {}

    class CapturingClient(_DummyClient):
        def __init__(self, **kwargs):
            # Record the kwargs httpx.Client is constructed with (auth, timeout).
            captured.update(kwargs)
            super().__init__({"jobs": [{"name": "bad"}]})

    monkeypatch.setattr(
        weather_module,
        "settings",
        types.SimpleNamespace(
            jenkins_base_url="https://ci.bstein.dev/",
            jenkins_api_user="user",
            jenkins_api_token="token",
            jenkins_api_timeout_sec=7.0,
        ),
    )
    monkeypatch.setattr(weather_module.httpx, "Client", CapturingClient)
    assert weather_module._fetch_jobs() == []
    assert captured["auth"] == ("user", "token")
    assert captured["timeout"] == 7.0

    class NonObjectClient(_DummyClient):
        def __init__(self, **kwargs):
            # A JSON array (not an object) from Jenkins must be rejected.
            super().__init__(["bad"])

    monkeypatch.setattr(weather_module.httpx, "Client", NonObjectClient)
    with pytest.raises(ValueError, match="non-object"):
        weather_module._fetch_jobs()
def test_remove_missing_series_ignores_missing_metric_labels(monkeypatch) -> None:
    """_remove_missing_series tolerates metrics that no longer know the label set."""

    class AlwaysMissingMetric:
        def remove(self, *labels):
            # prometheus_client raises KeyError for unknown label combinations.
            raise KeyError(labels)

    stale_series = ("old", "https://ci/job/old/", "☀️")
    weather_module._JOB_SERIES = {stale_series}
    monkeypatch.setattr(weather_module, "_JOB_METRICS", (AlwaysMissingMetric(),))
    weather_module._remove_missing_series(set())
    # The stale series is dropped from the cache even though remove() failed.
    assert weather_module._JOB_SERIES == set()
def test_collect_jenkins_build_weather_records_error(monkeypatch) -> None:
    """A failing Jenkins fetch propagates and bumps the 'error' run counter."""
    runs_metric = "ariadne_jenkins_build_weather_runs_total"
    monkeypatch.setattr(weather_module, "settings", _dummy_settings())
    errors_before = _metric_value(runs_metric, {"status": "error"}) or 0.0

    def _boom():
        raise RuntimeError("jenkins down")

    monkeypatch.setattr(weather_module, "_fetch_jobs", _boom)
    with pytest.raises(RuntimeError, match="jenkins down"):
        weather_module.collect_jenkins_build_weather()
    errors_after = _metric_value(runs_metric, {"status": "error"}) or 0.0
    assert errors_after == errors_before + 1

View File

@ -0,0 +1,388 @@
from __future__ import annotations
from datetime import datetime, timezone
import types
from prometheus_client import REGISTRY
from ariadne.services import jenkins_workspace_cleanup as cleanup_module
def _metric_value(name: str, labels: dict[str, str]) -> float:
    """Read one sample from the global registry, treating an absent series as 0.0."""
    sample = REGISTRY.get_sample_value(name, labels)
    if sample is None:
        return 0.0
    return float(sample)
def _dummy_settings(*, dry_run: bool, max_deletions: int = 20) -> types.SimpleNamespace:
return types.SimpleNamespace(
jenkins_workspace_namespace="jenkins",
jenkins_workspace_pvc_prefix="pvc-workspace-",
jenkins_workspace_cleanup_min_age_hours=1.0,
jenkins_workspace_cleanup_dry_run=dry_run,
jenkins_workspace_cleanup_max_deletions_per_run=max_deletions,
)
def _fake_payloads(now_iso: str, old_iso: str) -> dict[str, dict[str, object]]:
    """Canned Kubernetes/Longhorn API responses keyed by request path.

    ``old_iso`` marks objects old enough to be cleanup-eligible; ``now_iso``
    marks objects too fresh. The fixture mixes eligible, in-use, fresh, and
    already-deleting objects so the cleanup tests can assert on each case.
    """
    return {
        # Pods: one running pod pins "pvc-workspace-active" via its volume list
        # and "pvc-workspace-annotated-active" via the workspace annotation.
        "/api/v1/namespaces/jenkins/pods": {
            "items": [
                {
                    "metadata": {
                        "annotations": {
                            "jenkins.io/workspace-pvc": "pvc-workspace-annotated-active",
                        }
                    },
                    "spec": {
                        "volumes": [
                            {"persistentVolumeClaim": {"claimName": "pvc-workspace-active"}},
                        ]
                    },
                }
            ]
        },
        # PVCs: only "pvc-workspace-stale" is old, Lost, and unreferenced.
        "/api/v1/namespaces/jenkins/persistentvolumeclaims": {
            "items": [
                {
                    "metadata": {"name": "pvc-workspace-stale", "creationTimestamp": old_iso},
                    "status": {"phase": "Lost"},
                },
                {
                    # Bound and mounted by the pod above — must survive.
                    "metadata": {"name": "pvc-workspace-active", "creationTimestamp": old_iso},
                    "status": {"phase": "Bound"},
                },
                {
                    # Lost but referenced via the pod annotation — must survive.
                    "metadata": {"name": "pvc-workspace-annotated-active", "creationTimestamp": old_iso},
                    "status": {"phase": "Lost"},
                },
                {
                    # Too young to clean.
                    "metadata": {"name": "pvc-workspace-fresh", "creationTimestamp": now_iso},
                    "status": {"phase": "Lost"},
                },
                {
                    # Already being deleted (deletionTimestamp set) — skip.
                    "metadata": {
                        "name": "pvc-workspace-deleting",
                        "creationTimestamp": old_iso,
                        "deletionTimestamp": old_iso,
                    },
                    "status": {"phase": "Lost"},
                },
            ]
        },
        # PVs: Released volumes whose claimRef points at the PVCs above;
        # only "pvc-old" (claimed by the stale PVC) is eligible.
        "/api/v1/persistentvolumes": {
            "items": [
                {
                    "metadata": {"name": "pvc-old", "creationTimestamp": old_iso},
                    "status": {"phase": "Released"},
                    "spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-stale"}},
                },
                {
                    "metadata": {"name": "pvc-active", "creationTimestamp": old_iso},
                    "status": {"phase": "Released"},
                    "spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-active"}},
                },
                {
                    "metadata": {"name": "pvc-annotated", "creationTimestamp": old_iso},
                    "status": {"phase": "Released"},
                    "spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-annotated-active"}},
                },
                {
                    "metadata": {"name": "pvc-fresh", "creationTimestamp": now_iso},
                    "status": {"phase": "Released"},
                    "spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-fresh"}},
                },
                {
                    "metadata": {
                        "name": "pvc-deleting",
                        "creationTimestamp": old_iso,
                        "deletionTimestamp": old_iso,
                    },
                    "status": {"phase": "Released"},
                    "spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-deleting"}},
                },
            ]
        },
        # Longhorn volumes: "pvc-old" and the jenkins-namespace orphan are
        # eligible; attached, wrong-namespace, fresh, and deleting ones are not.
        "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes": {
            "items": [
                {"metadata": {"name": "pvc-old", "creationTimestamp": old_iso}},
                {
                    "metadata": {
                        "name": "pvc-orphan",
                        "creationTimestamp": old_iso,
                        "labels": {
                            "kubernetes.io/created-for/pvc/name": "pvc-workspace-orphan",
                            "kubernetes.io/created-for/pvc/namespace": "jenkins",
                        },
                    }
                },
                {
                    # Still attached to a workload — must survive.
                    "metadata": {
                        "name": "pvc-attached",
                        "creationTimestamp": old_iso,
                        "labels": {
                            "kubernetes.io/created-for/pvc/name": "pvc-workspace-annotated-active",
                            "kubernetes.io/created-for/pvc/namespace": "jenkins",
                        },
                    },
                    "status": {"state": "attached", "isAttached": True, "robustness": "healthy"},
                    "spec": {"frontend": "blockdev"},
                },
                {
                    # Belongs to another namespace — out of scope.
                    "metadata": {
                        "name": "pvc-orphan-other-namespace",
                        "creationTimestamp": old_iso,
                        "labels": {
                            "kubernetes.io/created-for/pvc/name": "pvc-workspace-orphan",
                            "kubernetes.io/created-for/pvc/namespace": "nextcloud",
                        },
                    }
                },
                {
                    # Too young to clean.
                    "metadata": {
                        "name": "pvc-orphan-fresh",
                        "creationTimestamp": now_iso,
                        "labels": {
                            "kubernetes.io/created-for/pvc/name": "pvc-workspace-fresh",
                            "kubernetes.io/created-for/pvc/namespace": "jenkins",
                        },
                    }
                },
                {
                    # Already being deleted — skip.
                    "metadata": {
                        "name": "pvc-vol-deleting",
                        "creationTimestamp": old_iso,
                        "deletionTimestamp": old_iso,
                        "labels": {
                            "kubernetes.io/created-for/pvc/name": "pvc-workspace-orphan",
                            "kubernetes.io/created-for/pvc/namespace": "jenkins",
                        },
                    }
                },
            ]
        },
    }
def test_cleanup_jenkins_workspace_storage_dry_run(monkeypatch) -> None:
    """Dry-run mode plans deletions but never calls the delete API."""
    monkeypatch.setattr(cleanup_module, "settings", _dummy_settings(dry_run=True))
    now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    old_iso = "2020-01-01T00:00:00Z"
    payloads = _fake_payloads(now_iso, old_iso)
    deleted_paths: list[str] = []

    def fake_get_json(path: str):
        if path in payloads:
            return payloads[path]
        raise AssertionError(f"unexpected path: {path}")

    def fake_delete_json(path: str):
        # Should never run under dry-run; recorded so the assert below would fail loudly.
        deleted_paths.append(path)
        return {"status": "Success"}

    # Baselines: the registry is shared across tests, so assert on deltas.
    before_runs = _metric_value(
        "ariadne_jenkins_workspace_cleanup_runs_total",
        {"status": "ok", "mode": "dry_run"},
    )
    before_planned = _metric_value(
        "ariadne_jenkins_workspace_cleanup_objects_total",
        {"kind": "pvc", "action": "planned", "mode": "dry_run"},
    )
    monkeypatch.setattr(cleanup_module, "get_json", fake_get_json)
    monkeypatch.setattr(cleanup_module, "delete_json", fake_delete_json)
    summary = cleanup_module.cleanup_jenkins_workspace_storage()
    assert summary.dry_run is True
    # Planned: 1 stale PVC, 1 released PV, 2 Longhorn volumes (see _fake_payloads).
    assert summary.pvcs_planned == 1
    assert summary.pvs_planned == 1
    assert summary.volumes_planned == 2
    # Nothing actually deleted in dry-run.
    assert summary.pvcs_deleted == 0
    assert summary.pvs_deleted == 0
    assert summary.volumes_deleted == 0
    assert summary.failures == 0
    assert deleted_paths == []
    assert _metric_value(
        "ariadne_jenkins_workspace_cleanup_runs_total",
        {"status": "ok", "mode": "dry_run"},
    ) == before_runs + 1
    assert _metric_value(
        "ariadne_jenkins_workspace_cleanup_objects_total",
        {"kind": "pvc", "action": "planned", "mode": "dry_run"},
    ) == before_planned + 1
def test_cleanup_jenkins_workspace_storage(monkeypatch) -> None:
    """Delete mode removes the stale PVC, its PV, and both orphaned Longhorn volumes."""
    monkeypatch.setattr(cleanup_module, "settings", _dummy_settings(dry_run=False))
    now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    old_iso = "2020-01-01T00:00:00Z"
    deleted_paths: list[str] = []
    payloads = _fake_payloads(now_iso, old_iso)

    def fake_get_json(path: str):
        if path in payloads:
            return payloads[path]
        raise AssertionError(f"unexpected path: {path}")

    def fake_delete_json(path: str):
        # Record every delete call so exact targets can be asserted below.
        deleted_paths.append(path)
        return {"status": "Success"}

    # Baselines: assert on deltas because the registry is shared across tests.
    before_runs = _metric_value(
        "ariadne_jenkins_workspace_cleanup_runs_total",
        {"status": "ok", "mode": "delete"},
    )
    before_deleted = _metric_value(
        "ariadne_jenkins_workspace_cleanup_objects_total",
        {"kind": "longhorn_volume", "action": "deleted", "mode": "delete"},
    )
    monkeypatch.setattr(cleanup_module, "get_json", fake_get_json)
    monkeypatch.setattr(cleanup_module, "delete_json", fake_delete_json)
    summary = cleanup_module.cleanup_jenkins_workspace_storage()
    assert summary.pvcs_deleted == 1
    assert summary.pvs_deleted == 1
    assert summary.volumes_deleted == 2
    assert summary.failures == 0
    # Eligible objects were deleted...
    assert "/api/v1/namespaces/jenkins/persistentvolumeclaims/pvc-workspace-stale" in deleted_paths
    assert "/api/v1/persistentvolumes/pvc-old" in deleted_paths
    assert "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes/pvc-old" in deleted_paths
    assert "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes/pvc-orphan" in deleted_paths
    # ...while out-of-namespace and attached volumes were left alone.
    assert "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes/pvc-orphan-other-namespace" not in deleted_paths
    assert "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes/pvc-attached" not in deleted_paths
    assert _metric_value(
        "ariadne_jenkins_workspace_cleanup_runs_total",
        {"status": "ok", "mode": "delete"},
    ) == before_runs + 1
    assert _metric_value(
        "ariadne_jenkins_workspace_cleanup_objects_total",
        {"kind": "longhorn_volume", "action": "deleted", "mode": "delete"},
    ) == before_deleted + 2
def test_cleanup_jenkins_workspace_storage_failure(monkeypatch) -> None:
    """A delete that raises counts as a failure and leaves delete counters at zero."""
    monkeypatch.setattr(cleanup_module, "settings", _dummy_settings(dry_run=False))
    responses = {
        "/api/v1/namespaces/jenkins/pods": {"items": []},
        "/api/v1/namespaces/jenkins/persistentvolumeclaims": {
            "items": [
                {
                    "metadata": {"name": "pvc-workspace-stale", "creationTimestamp": "2020-01-01T00:00:00Z"},
                    "status": {"phase": "Lost"},
                }
            ]
        },
        "/api/v1/persistentvolumes": {"items": []},
        "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes": {"items": []},
    }

    def fake_get_json(path: str):
        try:
            return responses[path]
        except KeyError:
            raise AssertionError(f"unexpected path: {path}") from None

    def fake_delete_json(_path: str):
        raise RuntimeError("boom")

    failed_labels = {"kind": "cleanup", "action": "failed", "mode": "delete"}
    before_failures = _metric_value(
        "ariadne_jenkins_workspace_cleanup_objects_total", failed_labels
    )
    monkeypatch.setattr(cleanup_module, "get_json", fake_get_json)
    monkeypatch.setattr(cleanup_module, "delete_json", fake_delete_json)
    summary = cleanup_module.cleanup_jenkins_workspace_storage()
    assert summary.failures == 1
    assert summary.pvcs_deleted == 0
    after_failures = _metric_value(
        "ariadne_jenkins_workspace_cleanup_objects_total", failed_labels
    )
    assert after_failures == before_failures + 1
def test_cleanup_jenkins_workspace_storage_uses_longhorn_kubernetes_status(monkeypatch) -> None:
    """An orphan identified only via Longhorn's kubernetesStatus is still cleaned up."""
    monkeypatch.setattr(cleanup_module, "settings", _dummy_settings(dry_run=False))
    deleted_paths: list[str] = []
    # Namespace/PVC ownership comes from status.kubernetesStatus, not labels.
    orphan_volume = {
        "metadata": {
            "name": "pvc-orphan-kstatus",
            "creationTimestamp": "2020-01-01T00:00:00Z",
        },
        "status": {
            "state": "detached",
            "isAttached": False,
            "robustness": "unknown",
            "kubernetesStatus": {
                "namespace": "jenkins",
                "pvcName": "pvc-workspace-kstatus",
                "pvName": "pvc-orphan-kstatus",
            },
        },
        "spec": {"frontend": "blockdev"},
    }
    responses = {
        "/api/v1/namespaces/jenkins/pods": {"items": []},
        "/api/v1/namespaces/jenkins/persistentvolumeclaims": {"items": []},
        "/api/v1/persistentvolumes": {"items": []},
        "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes": {"items": [orphan_volume]},
    }

    def fake_get_json(path: str):
        try:
            return responses[path]
        except KeyError:
            raise AssertionError(f"unexpected path: {path}") from None

    def fake_delete_json(path: str):
        deleted_paths.append(path)
        return {"status": "Success"}

    monkeypatch.setattr(cleanup_module, "get_json", fake_get_json)
    monkeypatch.setattr(cleanup_module, "delete_json", fake_delete_json)
    summary = cleanup_module.cleanup_jenkins_workspace_storage()
    assert summary.volumes_planned == 1
    assert summary.volumes_deleted == 1
    assert summary.failures == 0
    assert deleted_paths == ["/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes/pvc-orphan-kstatus"]
def test_cleanup_jenkins_workspace_storage_guard_caps_mass_delete(monkeypatch) -> None:
    """With max_deletions=1, only the first eligible object is deleted; the rest are skipped."""
    monkeypatch.setattr(cleanup_module, "settings", _dummy_settings(dry_run=False, max_deletions=1))
    now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    old_iso = "2020-01-01T00:00:00Z"
    payloads = _fake_payloads(now_iso, old_iso)
    deleted_paths: list[str] = []

    def fake_get_json(path: str):
        if path in payloads:
            return payloads[path]
        raise AssertionError(f"unexpected path: {path}")

    def fake_delete_json(path: str):
        deleted_paths.append(path)
        return {"status": "Success"}

    monkeypatch.setattr(cleanup_module, "get_json", fake_get_json)
    monkeypatch.setattr(cleanup_module, "delete_json", fake_delete_json)
    summary = cleanup_module.cleanup_jenkins_workspace_storage()
    assert summary.failures == 0
    # NOTE(review): with the cap active only one object per kind is planned
    # here (vs 2 volumes uncapped) — planning appears to respect the cap too.
    assert summary.pvcs_planned == 1
    assert summary.pvs_planned == 1
    assert summary.volumes_planned == 1
    # Only the PVC (first in deletion order) is actually removed.
    assert summary.pvcs_deleted == 1
    assert summary.pvs_deleted == 0
    assert summary.volumes_deleted == 0
    assert summary.skipped == 2
    assert deleted_paths == ["/api/v1/namespaces/jenkins/persistentvolumeclaims/pvc-workspace-stale"]

View File

@ -1,5 +1,8 @@
from __future__ import annotations
import builtins
import importlib.util
import sys
import types
import pytest
@ -57,11 +60,25 @@ class HangingStream(DummyStream):
return False
class ReturnCodeStream(DummyStream):
    """Closed stream exposing an exit status only via ``returncode``.

    Exercises the executor fallback: with the stream already closed, the exit
    code must come from ``returncode`` and ``peek_exit_code`` must never run.
    """

    def __init__(self):
        # The stream payload claims exit_code=0, but the authoritative code is 7.
        super().__init__(stdout="fallback", stderr="", exit_code=0)
        self.returncode = 7

    def is_open(self) -> bool:
        # Always closed, forcing the returncode fallback path.
        return False

    def peek_exit_code(self):
        raise AssertionError("closed streams should not read exit code")
def test_build_command_wraps_env() -> None:
    """_build_command wraps list commands in a shell with exported env vars."""
    wrapped = _build_command(["echo", "hello"], {"FOO": "bar"})
    shell, script = wrapped[0], wrapped[2]
    assert shell == "/bin/sh"
    assert "export FOO=bar" in script
    # A plain string with no env becomes a bare shell invocation.
    bare = _build_command("echo hello", None)
    assert bare == ["/bin/sh", "-c", "echo hello"]
def test_exec_returns_output(monkeypatch) -> None:
monkeypatch.setattr(exec_module, "select_pod", lambda *_args, **_kwargs: PodRef("pod", "ns"))
@ -94,6 +111,17 @@ def test_exec_times_out(monkeypatch) -> None:
executor.exec(["sleep", "10"], timeout_sec=0.0, check=False)
def test_exec_uses_returncode_when_stream_has_no_exit_code(monkeypatch) -> None:
    """When the stream is closed, the executor falls back to its returncode attribute."""
    fake_client = types.SimpleNamespace(connect_get_namespaced_pod_exec=None)
    monkeypatch.setattr(exec_module, "select_pod", lambda *_args, **_kwargs: PodRef("pod", "ns"))
    monkeypatch.setattr(exec_module, "_ensure_client", lambda: fake_client)
    monkeypatch.setattr(exec_module, "stream", lambda *args, **kwargs: ReturnCodeStream())
    executor = PodExecutor("ns", "app=test", None)
    result = executor.exec("echo ok", check=False)
    # ReturnCodeStream.returncode is 7, so the result is a failure.
    assert result.exit_code == 7
    assert result.ok is False
def test_ensure_client_fallback(monkeypatch) -> None:
dummy_api = object()
monkeypatch.setattr(exec_module, "_CORE_API", None)
@ -115,3 +143,39 @@ def test_ensure_client_fallback(monkeypatch) -> None:
monkeypatch.setattr(exec_module, "client", types.SimpleNamespace(CoreV1Api=lambda: dummy_api))
assert exec_module._ensure_client() is dummy_api
def test_ensure_client_cached_and_import_error(monkeypatch) -> None:
    """_ensure_client returns the cached API object, or raises if the import failed."""
    # Cached path: a pre-set _CORE_API is returned as-is.
    sentinel_api = object()
    monkeypatch.setattr(exec_module, "_IMPORT_ERROR", None)
    monkeypatch.setattr(exec_module, "_CORE_API", sentinel_api)
    assert exec_module._ensure_client() is sentinel_api
    # Import-failure path: a recorded import error surfaces as a RuntimeError.
    import_failure = RuntimeError("missing kubernetes")
    monkeypatch.setattr(exec_module, "_CORE_API", None)
    monkeypatch.setattr(exec_module, "_IMPORT_ERROR", import_failure)
    with pytest.raises(RuntimeError, match="kubernetes client missing"):
        exec_module._ensure_client()
def test_exec_module_import_error_fallback(monkeypatch) -> None:
    """Re-executing the exec module with kubernetes unimportable leaves None stubs.

    Loads a second copy of the module under a probe name while ``__import__``
    is patched to fail for the kubernetes package, then inspects the
    module-level fallback globals.
    """
    real_import = builtins.__import__

    def fake_import(name, globals=None, locals=None, fromlist=(), level=0):
        # Only the kubernetes package (and its submodules) is made unimportable.
        if name == "kubernetes" or name.startswith("kubernetes."):
            raise RuntimeError("kubernetes unavailable")
        return real_import(name, globals, locals, fromlist, level)

    module_name = "ariadne.k8s.exec_import_failure_probe"
    spec = importlib.util.spec_from_file_location(module_name, exec_module.__file__)
    assert spec and spec.loader
    module = importlib.util.module_from_spec(spec)
    monkeypatch.setattr(builtins, "__import__", fake_import)
    # Register the probe module so imports inside it can resolve during exec.
    monkeypatch.setitem(sys.modules, module_name, module)
    spec.loader.exec_module(module)
    # All kubernetes names fall back to None; the failure is recorded.
    assert module.client is None
    assert module.config is None
    assert module.stream is None
    assert isinstance(module._IMPORT_ERROR, RuntimeError)

View File

@ -17,6 +17,23 @@ def test_list_pods_encodes_selector(monkeypatch) -> None:
assert "labelSelector=app%3Dnextcloud" in captured["path"]
def test_list_pods_rejects_missing_namespace() -> None:
    """A blank namespace is rejected before any API call is made."""
    blank_namespace = " "
    with pytest.raises(pods_module.PodSelectionError, match="namespace missing"):
        pods_module.list_pods(blank_namespace, "app=nextcloud")
def test_parse_start_time_handles_empty_invalid_and_naive_values() -> None:
    """Missing or unparseable timestamps collapse to 0.0; naive ISO strings still parse."""
    for bad_value in (None, "not-a-date"):
        assert pods_module._parse_start_time(bad_value) == 0.0
    naive_result = pods_module._parse_start_time("2026-01-20T00:00:00")
    assert naive_result > 0
def test_ready_helper_handles_malformed_conditions() -> None:
    """_is_ready is False for missing, None, or non-Ready condition entries."""
    malformed_statuses = (
        {"status": {"phase": "Running"}},
        {"status": {"phase": "Running", "conditions": [None]}},
        {"status": {"phase": "Running", "conditions": [{"type": "ContainersReady"}]}},
    )
    for pod in malformed_statuses:
        assert pods_module._is_ready(pod) is False
def test_select_pod_picks_ready_latest(monkeypatch) -> None:
payload = {
"items": [
@ -57,3 +74,28 @@ def test_select_pod_ignores_non_ready(monkeypatch) -> None:
with pytest.raises(pods_module.PodSelectionError):
pods_module.select_pod("demo", "app=test")
def test_select_pod_skips_deleting_and_blank_names(monkeypatch) -> None:
    """select_pod ignores terminating pods and pods with blank names."""
    ready_condition = [{"type": "Ready", "status": "True"}]
    terminating_pod = {
        "metadata": {"name": "deleting", "deletionTimestamp": "2026-01-20T00:00:00Z"},
        "status": {"phase": "Running", "conditions": ready_condition},
    }
    unnamed_pod = {
        "metadata": {"name": " "},
        "status": {"phase": "Running", "conditions": ready_condition},
    }
    healthy_pod = {
        "metadata": {"name": "ready"},
        "status": {"phase": "Running", "nodeName": "titan-1", "conditions": ready_condition},
    }
    payload = {"items": [terminating_pod, unnamed_pod, healthy_pod]}
    monkeypatch.setattr(pods_module, "get_json", lambda *_args, **_kwargs: payload)
    selected = pods_module.select_pod("demo", "app=test")
    assert selected.name == "ready"
    assert selected.node == "titan-1"

View File

@ -1,5 +1,8 @@
from __future__ import annotations
from types import SimpleNamespace
from ariadne.services import mailu_events as mailu_events_module
from ariadne.services.mailu_events import MailuEventRunner
@ -52,3 +55,95 @@ def test_mailu_event_debounce() -> None:
status, payload = events.handle_event({"force": True})
assert status == 202
assert payload["status"] == "accepted"
def test_mailu_event_parses_string_flags_and_context() -> None:
    """String truthy flags ("yes"/"on") and padded event fields are normalized."""
    recorded: list[tuple[str, bool]] = []

    def recording_runner(reason: str, force: bool):
        recorded.append((reason, force))
        return "ok", ""

    events = MailuEventRunner(
        min_interval_sec=0.0,
        wait_timeout_sec=0.1,
        runner=recording_runner,
        thread_factory=_instant_thread_factory,
    )
    event = {"wait": "yes", "force": "on", "eventType": " UPDATE_PROFILE ", "userId": " u1 "}
    status, payload = events.handle_event(event)
    assert status == 200
    assert payload["status"] == "ok"
    # Event type is trimmed and folded into the sync reason; force parsed as True.
    assert recorded == [("keycloak_event:UPDATE_PROFILE", True)]
def test_mailu_event_defaults_for_missing_payload() -> None:
    """A None payload triggers an async (202) run with default reason and force=False."""
    recorded: list[tuple[str, bool]] = []

    def recording_runner(reason: str, force: bool):
        recorded.append((reason, force))
        return "ok", ""

    events = MailuEventRunner(
        min_interval_sec=0.0,
        wait_timeout_sec=0.1,
        runner=recording_runner,
        thread_factory=_instant_thread_factory,
    )
    status, payload = events.handle_event(None)
    assert status == 202
    assert payload == {"status": "accepted", "triggered": True}
    assert recorded == [("keycloak_event", False)]
def test_mailu_event_running_skip_and_wait_timeout() -> None:
    """A never-finishing worker yields 'running' on wait-timeout, then 'skipped' while busy."""

    class ParkedThread:
        # start() does nothing, so the sync never completes.
        def start(self) -> None:
            return None

    def parked_thread_factory(target=None, args=(), daemon=None):
        return ParkedThread()

    events = MailuEventRunner(
        min_interval_sec=0.0,
        wait_timeout_sec=0.0,
        runner=lambda _reason, _force: ("ok", ""),
        thread_factory=parked_thread_factory,
    )
    # Waiting with a zero timeout reports the run as still in progress.
    status, payload = events.handle_event({"wait": True})
    assert status == 200
    assert payload == {"status": "running"}
    # A second, non-waiting event is skipped while the first is "running".
    status, payload = events.handle_event({})
    assert status == 202
    assert payload == {"status": "skipped", "triggered": False}
def test_mailu_event_runner_reports_exceptions() -> None:
    """An exception inside the runner surfaces as a 500 with the error detail."""

    def failing_runner(_reason: str, _force: bool):
        raise RuntimeError("mailu exploded")

    events = MailuEventRunner(
        min_interval_sec=0.0,
        wait_timeout_sec=0.1,
        runner=failing_runner,
        thread_factory=_instant_thread_factory,
    )
    status, payload = events.handle_event({"wait": True})
    assert status == 500
    expected = {"status": "error", "detail": "mailu exploded"}
    assert payload == expected
def test_default_runner_maps_mailu_summary(monkeypatch) -> None:
    """_default_runner maps a mailu sync summary onto an (status, detail) pair."""
    events = MailuEventRunner(min_interval_sec=0.0, wait_timeout_sec=0.1)

    def sync_clean(reason, force=False):
        return SimpleNamespace(failures=0, detail="synced")

    monkeypatch.setattr(mailu_events_module.mailu, "sync", sync_clean)
    assert events._default_runner("test", True) == ("ok", "synced")

    def sync_broken(reason, force=False):
        return SimpleNamespace(failures=1, detail="failed")

    monkeypatch.setattr(mailu_events_module.mailu, "sync", sync_broken)
    assert events._default_runner("test", False) == ("error", "failed")

View File

@ -111,3 +111,90 @@ def test_watch_sentinel_handles_http_error(monkeypatch) -> None:
assert summary.status == "error"
assert summary.detail == "upstream fail"
assert summary.result["detail"] == "upstream fail"
def test_normalize_payload_and_ready(monkeypatch) -> None:
    """ready() tracks the configured base URL; _normalize_payload wraps non-dicts."""
    configured = SimpleNamespace(metis_base_url="http://metis", metis_watch_url="", metis_timeout_sec=10.0)
    monkeypatch.setattr("ariadne.services.metis.settings", configured)
    service = metis_module.MetisService()
    assert service.ready() is True
    # None collapses to an empty dict; non-dict payloads are wrapped under "result".
    assert metis_module._normalize_payload(None) == {}
    assert metis_module._normalize_payload(["watched"]) == {"result": ["watched"]}
    unconfigured = SimpleNamespace(metis_base_url="", metis_watch_url="", metis_timeout_sec=10.0)
    monkeypatch.setattr("ariadne.services.metis.settings", unconfigured)
    assert service.ready() is False
def test_watch_sentinel_handles_non_json_success(monkeypatch) -> None:
    """A 2xx response whose body is not JSON still counts as an ok watch."""
    monkeypatch.setattr(
        "ariadne.services.metis.settings",
        SimpleNamespace(metis_base_url="http://metis", metis_watch_url="", metis_timeout_sec=10.0),
    )
    stub_client = DummyClient(DummyResponse(payload=ValueError("not json")))
    monkeypatch.setattr(metis_module.httpx, "Client", lambda **_kwargs: stub_client)
    summary = metis_module.MetisService().watch_sentinel()
    assert summary.status == "ok"
    assert summary.result == {}
def test_watch_sentinel_handles_http_error_without_json(monkeypatch) -> None:
    """A non-2xx response with a non-JSON body reports the bare HTTP status code."""
    monkeypatch.setattr(
        "ariadne.services.metis.settings",
        SimpleNamespace(metis_base_url="http://metis", metis_watch_url="", metis_timeout_sec=10.0),
    )
    stub_client = DummyClient(DummyResponse(status_code=503, payload=ValueError("not json")))
    monkeypatch.setattr(metis_module.httpx, "Client", lambda **_kwargs: stub_client)
    summary = metis_module.MetisService().watch_sentinel()
    assert summary.status == "error"
    assert summary.detail == "metis watch failed with HTTP 503"
def test_watch_sentinel_handles_client_exception(monkeypatch) -> None:
    """An exception raised while entering the HTTP client becomes an error summary."""
    monkeypatch.setattr(
        "ariadne.services.metis.settings",
        SimpleNamespace(metis_base_url="http://metis", metis_watch_url="", metis_timeout_sec=10.0),
    )

    class FailingClient:
        def __enter__(self):
            raise RuntimeError("network down")

        def __exit__(self, exc_type, exc, tb):
            return False

    monkeypatch.setattr(metis_module.httpx, "Client", lambda **_kwargs: FailingClient())
    summary = metis_module.MetisService().watch_sentinel()
    assert summary.status == "error"
    assert summary.detail == "network down"
def test_watch_sentinel_normalizes_message_and_unknown_status(monkeypatch) -> None:
    """Messages are stripped; unrecognized statuses get a generated detail string."""
    monkeypatch.setattr(
        "ariadne.services.metis.settings",
        SimpleNamespace(metis_base_url="http://metis", metis_watch_url="", metis_timeout_sec=10.0),
    )
    stub_client = DummyClient(
        DummyResponse(payload={"status": "warning", "message": " watched with warnings "})
    )
    monkeypatch.setattr(metis_module.httpx, "Client", lambda **_kwargs: stub_client)
    first = metis_module.MetisService().watch_sentinel()
    assert first.status == "ok"
    assert first.detail == "watched with warnings"
    # Swap in a payload with an unknown status and watch again.
    stub_client.response = DummyResponse(payload={"status": "paused"})
    second = metis_module.MetisService().watch_sentinel()
    assert second.status == "ok"
    assert second.detail == "metis watch returned paused"

View File

@ -0,0 +1,10 @@
from __future__ import annotations
from ariadne.utils.name_generator import NameGenerator
def test_name_generator_returns_none_after_exhausting_attempts(monkeypatch) -> None:
    """unique() gives up with None when every attempt collides with a taken name."""
    # Patch at class level so every generate() call returns the colliding name.
    monkeypatch.setattr(NameGenerator, "generate", lambda self: "already-used")
    generator = NameGenerator(max_attempts=2)
    taken = {"already-used"}
    assert generator.unique(taken) is None

View File

@ -5,10 +5,23 @@ import types
import ariadne.services.opensearch_prune as prune_module
def _settings(**overrides):
values = {
"opensearch_url": "http://opensearch",
"opensearch_limit_bytes": 5,
"opensearch_index_patterns": "kube-*",
"opensearch_timeout_sec": 5.0,
}
values.update(overrides)
return types.SimpleNamespace(**values)
def test_parse_size() -> None:
    """parse_size handles empty, valid, zero, and malformed size strings."""
    expectations = {
        "": 0,
        "1gb": 1024**3,
        "0": 0,
        "bad": 0,
        "1zb": 0,
    }
    for raw, expected in expectations.items():
        assert prune_module.parse_size(raw) == expected
def test_prune_indices_deletes(monkeypatch) -> None:
@ -58,3 +71,118 @@ def test_prune_indices_deletes(monkeypatch) -> None:
summary = prune_module.prune_indices()
assert summary.deleted == 1
def test_fetch_indices_ignores_missing_pattern(monkeypatch) -> None:
    """A 404 for an index pattern yields an empty list without raising."""
    monkeypatch.setattr(prune_module, "settings", _settings())

    class NotFoundResponse:
        status_code = prune_module.HTTP_NOT_FOUND

        def raise_for_status(self):
            raise AssertionError("404 should be handled before raise_for_status")

    stub_client = types.SimpleNamespace(get=lambda *_args, **_kwargs: NotFoundResponse())
    assert prune_module._fetch_indices(stub_client, "missing-*") == []
def test_prune_indices_returns_when_no_patterns(monkeypatch) -> None:
    """Blank/whitespace pattern configuration short-circuits with a no-op summary."""
    monkeypatch.setattr(prune_module, "settings", _settings(opensearch_index_patterns=" , "))
    summary = prune_module.prune_indices()
    assert summary.deleted == 0
    assert summary.detail == "no patterns configured"
def test_prune_indices_continues_after_fetch_failure(monkeypatch) -> None:
    """A failing pattern fetch is skipped; remaining patterns are still evaluated."""
    monkeypatch.setattr(
        prune_module,
        "settings",
        _settings(opensearch_index_patterns="bad-*,kube-*", opensearch_limit_bytes=100),
    )

    class IndexListing:
        status_code = 200

        def raise_for_status(self):
            return None

        def json(self):
            # The dot-prefixed index is excluded from totals (total_before == 1 below).
            return [
                {"index": ".system", "store.size": "100b", "creation.date": "1"},
                {"index": "kube-1", "store.size": "1b", "creation.date": "2"},
            ]

    class StubClient:
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def get(self, url, params=None):
            # First pattern blows up; only the second produces a listing.
            if "bad-*" in url:
                raise RuntimeError("fetch failed")
            return IndexListing()

        def delete(self, _url):
            raise AssertionError("within-limit result should not delete indices")

    monkeypatch.setattr(prune_module.httpx, "Client", lambda *_args, **_kwargs: StubClient())
    summary = prune_module.prune_indices()
    assert summary.detail == "within limit"
    assert summary.total_before == 1
def test_prune_indices_logs_delete_failures_and_keeps_pruning(monkeypatch) -> None:
    """A failed delete is tolerated and pruning continues with the next index."""
    monkeypatch.setattr(prune_module, "settings", _settings(opensearch_limit_bytes=5))

    class StubResponse:
        status_code = 200

        def __init__(self, payload):
            self._payload = payload

        def raise_for_status(self):
            return None

        def json(self):
            return self._payload

    class RecordingClient:
        def __init__(self):
            self.deleted: list[str] = []

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def get(self, _url, params=None):
            return StubResponse(
                [
                    {"index": "kube-old", "store.size": "10b", "creation.date": "1"},
                    {"index": "kube-new", "store.size": "10b", "creation.date": "2"},
                ]
            )

        def delete(self, url):
            self.deleted.append(url)
            # The oldest index fails to delete; the newer one succeeds.
            if url.endswith("/kube-old"):
                raise RuntimeError("delete failed")
            return StubResponse({})

    recording = RecordingClient()
    monkeypatch.setattr(prune_module.httpx, "Client", lambda *_args, **_kwargs: recording)
    summary = prune_module.prune_indices()
    assert summary.deleted == 1
    assert summary.total_before == 20
    assert summary.total_after == 10
    assert recording.deleted == ["http://opensearch/kube-old", "http://opensearch/kube-new"]

View File

@ -35,3 +35,15 @@ def test_clean_finished_pods_handles_failure(monkeypatch) -> None:
summary = pod_cleaner.clean_finished_pods()
assert summary.failures == 2
def test_clean_finished_pods_skips_missing_identifiers(monkeypatch) -> None:
    """Pods lacking a name, a namespace, or well-formed metadata are skipped."""
    malformed_items = [
        {"metadata": {"namespace": "ns"}},  # no name
        {"metadata": {"name": "pod"}},  # no namespace
        {"metadata": "bad"},  # metadata is not a mapping
    ]
    monkeypatch.setattr(pod_cleaner, "get_json", lambda _path: {"items": malformed_items})
    monkeypatch.setattr(pod_cleaner, "delete_json", lambda _path: None)
    summary = pod_cleaner.clean_finished_pods()
    assert summary.skipped == 6
    assert summary.deleted == 0

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,7 @@ from dataclasses import dataclass
from datetime import datetime, timezone
import time
from ariadne.db.storage import ScheduleState
from ariadne.scheduler.cron import CronScheduler, CronTask
@ -22,6 +23,9 @@ class DummyStorage:
def record_event(self, *args, **kwargs):
self.events.append((args, kwargs))
def list_schedule_states(self):
return []
def test_execute_task_records_failure() -> None:
storage = DummyStorage()
@ -87,6 +91,115 @@ def test_scheduler_start_skips_when_running() -> None:
assert scheduler._thread.started is False
def test_scheduler_start_hydrates_persisted_schedule_metrics(monkeypatch) -> None:
    """start() replays persisted states for registered tasks and ignores unknown ones."""
    finished = datetime(2026, 1, 1, 12, 0, tzinfo=timezone.utc)

    def _ok_state(name: str, cron: str) -> ScheduleState:
        return ScheduleState(
            task_name=name,
            cron_expr=cron,
            last_started_at=finished,
            last_finished_at=finished,
            last_status="ok",
            last_error=None,
            last_duration_ms=100,
            next_run_at=None,
        )

    class HydratingStorage(DummyStorage):
        def list_schedule_states(self):
            # "nightly" is registered below; "unknown" is not.
            return [_ok_state("nightly", "30 4 * * *"), _ok_state("unknown", "* * * * *")]

    recorded = []
    monkeypatch.setattr(
        "ariadne.scheduler.cron.record_schedule_state",
        lambda *args: recorded.append(args),
    )
    scheduler = CronScheduler(HydratingStorage(), tick_sec=0.01)
    scheduler.add_task("nightly", "30 4 * * *", lambda: None)
    scheduler.start()
    scheduler.stop()
    assert any(entry[0] == "nightly" and entry[4] is True for entry in recorded)
    assert not any(entry[0] == "unknown" for entry in recorded)
def test_scheduler_hydration_ignores_storage_without_state_listing() -> None:
    """Hydration is a silent no-op for storages lacking list_schedule_states."""

    class MinimalStorage:
        pass

    scheduler = CronScheduler(MinimalStorage(), tick_sec=0.01)
    scheduler._hydrate_schedule_metrics()  # must not raise
def test_scheduler_hydration_logs_storage_errors(monkeypatch) -> None:
    """A storage failure during hydration is logged as a warning, not raised."""

    class BrokenStorage(DummyStorage):
        def list_schedule_states(self):
            raise RuntimeError("storage offline")

    captured = []
    scheduler = CronScheduler(BrokenStorage(), tick_sec=0.01)
    monkeypatch.setattr(
        scheduler._logger,
        "warning",
        lambda *args, **kwargs: captured.append((args, kwargs)),
    )
    scheduler._hydrate_schedule_metrics()
    assert captured
    # The failure detail is forwarded via the logger's structured "extra" mapping.
    assert captured[0][1]["extra"]["detail"] == "storage offline"
def test_scheduler_hydration_records_error_and_unknown_statuses(monkeypatch) -> None:
    """Error states hydrate with success=False; unrecognized statuses record None."""
    started = datetime(2026, 1, 1, 12, 0, tzinfo=timezone.utc)

    class StatusStorage(DummyStorage):
        def list_schedule_states(self):
            return [
                ScheduleState(
                    task_name="failed-task",
                    cron_expr="*/5 * * * *",
                    last_started_at=started,
                    last_finished_at=None,
                    last_status="error",
                    last_error="boom",
                    last_duration_ms=100,
                    next_run_at=None,
                ),
                ScheduleState(
                    task_name="pending-task",
                    cron_expr="*/10 * * * *",
                    last_started_at=started,
                    last_finished_at=None,
                    last_status="running",
                    last_error=None,
                    last_duration_ms=100,
                    next_run_at=None,
                ),
            ]

    recorded = []
    monkeypatch.setattr(
        "ariadne.scheduler.cron.record_schedule_state",
        lambda *args: recorded.append(args),
    )
    scheduler = CronScheduler(StatusStorage(), tick_sec=0.01)
    scheduler.add_task("failed-task", "*/5 * * * *", lambda: None)
    scheduler.add_task("pending-task", "*/10 * * * *", lambda: None)
    # Drop the computed next-run entry so hydration exercises that missing path.
    scheduler._next_run.pop("pending-task")
    scheduler._hydrate_schedule_metrics()
    failed = next(entry for entry in recorded if entry[0] == "failed-task")
    pending = next(entry for entry in recorded if entry[0] == "pending-task")
    assert failed[2] is None
    assert failed[4] is False
    assert pending[3] is None
    assert pending[4] is None
def test_compute_next_handles_naive_timestamp() -> None:
scheduler = CronScheduler(DummyStorage(), tick_sec=0.1)
base = datetime(2024, 1, 1, 12, 0, 0)

File diff suppressed because it is too large Load Diff

View File

@ -25,3 +25,18 @@ def test_from_env_includes_metis_settings(monkeypatch) -> None:
assert cfg.metis_watch_url == "http://metis.example/internal/sentinel/watch"
assert cfg.metis_timeout_sec == 9.5
assert cfg.metis_sentinel_watch_cron == "*/7 * * * *"
def test_from_env_includes_jenkins_weather_settings(monkeypatch) -> None:
    """Jenkins env vars map onto settings; the base URL loses its trailing slash."""
    env = {
        "JENKINS_BASE_URL": "https://ci.bstein.dev/",
        "JENKINS_API_USER": "ariadne",
        "JENKINS_API_TOKEN": "token",
        "JENKINS_API_TIMEOUT_SEC": "8.5",
        "ARIADNE_SCHEDULE_JENKINS_BUILD_WEATHER": "*/9 * * * *",
    }
    for key, value in env.items():
        monkeypatch.setenv(key, value)
    cfg = Settings.from_env()
    assert cfg.jenkins_base_url == "https://ci.bstein.dev"
    assert cfg.jenkins_api_user == "ariadne"
    assert cfg.jenkins_api_token == "token"
    assert cfg.jenkins_api_timeout_sec == 8.5
    assert cfg.jenkins_build_weather_cron == "*/9 * * * *"

View File

@ -345,6 +345,31 @@ def test_update_schedule_state_executes() -> None:
assert db.executed
def test_list_schedule_states_returns_valid_rows() -> None:
    """Rows missing required identifiers are dropped; valid rows become states."""
    db = DummyDB()
    now = datetime.now()
    valid_row = {
        "task_name": "schedule.nightly",
        "cron_expr": "30 4 * * *",
        "last_started_at": now,
        "last_finished_at": now,
        "last_status": "ok",
        "last_error": None,
        "last_duration_ms": 10,
        "next_run_at": None,
    }
    db.rows = [valid_row, {"task_name": None, "cron_expr": "bad"}]
    states = Storage(db).list_schedule_states()
    assert len(states) == 1
    assert states[0].task_name == "schedule.nightly"
    assert states[0].last_status == "ok"
def test_record_cluster_state_executes() -> None:
db = DummyDB()
storage = Storage(db)
@ -359,8 +384,27 @@ def test_prune_cluster_state_skips_zero() -> None:
assert not db.executed
def test_prune_cluster_state_executes_with_positive_keep() -> None:
    """A positive keep count issues a prune statement bound to that count."""
    db = DummyDB()
    Storage(db).prune_cluster_state(3)
    # The most recent executed statement carries the keep count as its parameter.
    assert db.executed[-1][1] == (3,)
def test_latest_cluster_state_parses_json() -> None:
    """A JSON-string snapshot column is decoded into a dict."""
    db = DummyDB(row={"snapshot": "{\"ok\": true}", "created_at": datetime.now()})
    assert Storage(db).latest_cluster_state() == {"ok": True}
def test_latest_cluster_state_handles_empty_and_native_snapshots() -> None:
    """No row yields None; an already-decoded dict snapshot passes through as-is."""
    assert Storage(DummyDB(row=None)).latest_cluster_state() is None
    native = DummyDB(row={"snapshot": {"ok": True}, "created_at": datetime.now()})
    assert Storage(native).latest_cluster_state() == {"ok": True}
def test_latest_cluster_state_rejects_bad_snapshot_payloads() -> None:
    """Malformed JSON strings and non-string/non-dict snapshots yield None."""
    for bad_snapshot in ("{bad", 42):
        db = DummyDB(row={"snapshot": bad_snapshot, "created_at": datetime.now()})
        assert Storage(db).latest_cluster_state() is None

View File

@ -81,6 +81,19 @@ def test_safe_error_detail_timeout() -> None:
assert safe_error_detail(exc, "fallback") == "timeout"
def test_safe_error_detail_http_status_without_message() -> None:
request = httpx.Request("GET", "https://example.com")
response = httpx.Response(503, json={"detail": "hidden"}, request=request)
exc = httpx.HTTPStatusError("bad", request=request, response=response)
assert safe_error_detail(exc, "fallback") == "http 503"
def test_safe_error_detail_fallbacks_for_empty_runtime_and_generic() -> None:
    """Blank RuntimeError messages and generic exceptions use the fallback text."""
    assert safe_error_detail(RuntimeError(" "), "fallback") == "fallback"
    assert safe_error_detail(ValueError("internal"), "fallback") == "fallback"
def test_extract_bearer_token() -> None:
    """A well-formed Bearer Authorization header yields the raw token string."""
    incoming = DummyRequest({"Authorization": "Bearer token123"})
    assert extract_bearer_token(incoming) == "token123"

0
tests/unit/__init__.py Normal file
View File

View File

View File

@ -0,0 +1,27 @@
from __future__ import annotations
import dataclasses
from datetime import datetime, timezone
from fastapi import HTTPException
from fastapi.testclient import TestClient
from ariadne.auth.keycloak import AuthContext
import ariadne.app as app_module
def _client(monkeypatch, ctx: AuthContext) -> TestClient:
    """Build a TestClient with auth stubbed to *ctx* and lifecycle hooks silenced."""
    monkeypatch.setattr(app_module.authenticator, "authenticate", lambda token: ctx)
    # Silence background services and DB teardown so app startup/shutdown is inert.
    for target, attr in (
        (app_module.provisioning, "start"),
        (app_module.scheduler, "start"),
        (app_module.provisioning, "stop"),
        (app_module.scheduler, "stop"),
        (app_module.portal_db, "close"),
        (app_module.ariadne_db, "close"),
    ):
        monkeypatch.setattr(target, attr, lambda: None)
    # Drop audit writes so tests do not depend on storage.
    for recorder in ("record_event", "record_task_run"):
        monkeypatch.setattr(app_module.storage, recorder, lambda *args, **kwargs: None)
    return TestClient(app_module.app)
__all__ = [name for name in globals() if not name.startswith("__")]

Some files were not shown because too many files have changed in this diff Show More