Compare commits

..

29 Commits

Author SHA1 Message Date
flux-bot
5cd4c0227a chore(bstein-dev-home): automated image update 2026-04-22 03:40:43 +00:00
flux-bot
6c4257fae0 chore(bstein-dev-home): automated image update 2026-04-22 03:40:38 +00:00
flux-bot
6a3c2d9085 chore(bstein-dev-home): automated image update 2026-04-22 03:28:37 +00:00
flux-bot
d8acd25bfb chore(bstein-dev-home): automated image update 2026-04-22 03:26:41 +00:00
flux-bot
11da02bf9a chore(bstein-dev-home): automated image update 2026-04-22 02:21:50 +00:00
flux-bot
7bd60845c8 chore(bstein-dev-home): automated image update 2026-04-22 02:19:51 +00:00
flux-bot
d47389b001 chore(bstein-dev-home): automated image update 2026-04-22 00:48:03 +00:00
flux-bot
414fdd4007 chore(bstein-dev-home): automated image update 2026-04-22 00:44:58 +00:00
flux-bot
63d7836633 chore(bstein-dev-home): automated image update 2026-04-22 00:25:00 +00:00
flux-bot
e9d6d347fa chore(bstein-dev-home): automated image update 2026-04-22 00:22:56 +00:00
flux-bot
1e5384d744 chore(bstein-dev-home): automated image update 2026-04-20 01:28:36 +00:00
flux-bot
70accb7ed4 chore(bstein-dev-home): automated image update 2026-04-20 01:27:47 +00:00
flux-bot
8b039bdd0a chore(bstein-dev-home): automated image update 2026-04-19 18:53:56 +00:00
flux-bot
47751a6c7c chore(bstein-dev-home): automated image update 2026-04-19 18:53:44 +00:00
flux-bot
df180700f8 chore(bstein-dev-home): automated image update 2026-04-19 18:50:43 +00:00
flux-bot
a6a6295c63 chore(bstein-dev-home): automated image update 2026-04-19 18:49:56 +00:00
flux-bot
f5ca9204eb chore(bstein-dev-home): automated image update 2026-04-18 20:59:34 +00:00
flux-bot
9c7a6490f2 chore(bstein-dev-home): automated image update 2026-04-18 20:59:06 +00:00
flux-bot
9f64da2250 chore(bstein-dev-home): automated image update 2026-04-18 16:29:59 +00:00
flux-bot
615f62d7b2 chore(bstein-dev-home): automated image update 2026-04-18 16:29:34 +00:00
flux-bot
34d0bf64fd chore(maintenance): automated image update 2026-04-13 05:01:46 +00:00
flux-bot
ed1f18f8c4 chore(maintenance): automated image update 2026-04-12 08:05:05 +00:00
flux-bot
bf4a0fbc1c chore(maintenance): automated image update 2026-04-12 08:02:05 +00:00
flux-bot
d1e5b9187b chore(maintenance): automated image update 2026-04-11 01:50:17 +00:00
flux-bot
c7cd723ee8 chore(bstein-dev-home): automated image update 2026-04-10 09:46:06 +00:00
flux-bot
4e177873c2 chore(bstein-dev-home): automated image update 2026-04-10 09:44:05 +00:00
flux-bot
2ca43b9c8b chore(maintenance): automated image update 2026-04-10 09:04:45 +00:00
flux-bot
afd6cb814b chore(bstein-dev-home): automated image update 2026-03-30 19:16:00 +00:00
flux-bot
1c68a44359 chore(maintenance): automated image update 2026-03-30 19:10:30 +00:00
220 changed files with 4147 additions and 34258 deletions

347
Jenkinsfile vendored
View File

@ -12,19 +12,8 @@ spec:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: jnlp
image: jenkins/inbound-agent:3355.v388858a_47b_33-2-jdk21
resources:
requests:
cpu: "25m"
memory: "256Mi"
- name: python
image: registry.bstein.dev/bstein/python:3.12-slim
command:
- cat
tty: true
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
image: python:3.12-slim
command:
- cat
tty: true
@ -34,21 +23,6 @@ spec:
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
SUITE_NAME = 'titan_iac'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'titan_iac'
SONARQUBE_TOKEN = credentials('sonarqube-token')
VM_URL = 'http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428'
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_ENFORCE = '1'
QUALITY_GATE_IRONBANK_REQUIRED = '0'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
}
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
}
stages {
stage('Checkout') {
@ -58,295 +32,12 @@ spec:
}
stage('Install deps') {
steps {
sh '''
set -eu
if ! command -v git >/dev/null 2>&1; then
apt-get update
apt-get install -y --no-install-recommends git ca-certificates
rm -rf /var/lib/apt/lists/*
fi
pip install --no-cache-dir -r ci/requirements.txt
'''
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Prepare local quality evidence') {
stage('Glue tests') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile local --build-dir build
local_quality_rc=$?
set -e
printf '%s\n' "${local_quality_rc}" > build/local-quality-gate.rc
'''
}
}
stage('Collect SonarQube evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=."
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**,services/monitoring/dashboards/**,services/monitoring/grafana-dashboard-*.yaml"
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
[ -f build/coverage-unit.xml ] && args+=("-Dsonar.python.coverage.reportPaths=build/coverage-unit.xml")
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
}
sh '''
set -eu
mkdir -p build
python3 - <<'PY'
import base64
import json
import os
import time
import urllib.parse
import urllib.request
from pathlib import Path
host = os.getenv('SONARQUBE_HOST_URL', '').strip().rstrip('/')
project_key = os.getenv('SONARQUBE_PROJECT_KEY', '').strip()
token = os.getenv('SONARQUBE_TOKEN', '').strip()
report_path = os.getenv('QUALITY_GATE_SONARQUBE_REPORT', 'build/sonarqube-quality-gate.json')
payload = {
"status": "ERROR",
"note": "missing SONARQUBE_HOST_URL and/or SONARQUBE_PROJECT_KEY",
}
if host and project_key:
task_file = Path('.scannerwork/report-task.txt')
task_id = ''
if task_file.exists():
for line in task_file.read_text(encoding='utf-8').splitlines():
key, _, value = line.partition('=')
if key == 'ceTaskId':
task_id = value.strip()
break
if task_id:
ce_query = urllib.parse.urlencode({"id": task_id})
deadline = time.monotonic() + 180
while time.monotonic() < deadline:
ce_request = urllib.request.Request(f"{host}/api/ce/task?{ce_query}", method="GET")
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
ce_request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(ce_request, timeout=12) as response:
ce_payload = json.loads(response.read().decode("utf-8"))
except Exception:
time.sleep(3)
continue
status = str(ce_payload.get("task", {}).get("status", "")).upper()
if status in {"SUCCESS", "FAILED", "CANCELED"}:
break
time.sleep(3)
query = urllib.parse.urlencode({"projectKey": project_key})
request = urllib.request.Request(
f"{host}/api/qualitygates/project_status?{query}",
method="GET",
)
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(request, timeout=12) as response:
payload = json.loads(response.read().decode("utf-8"))
except Exception as exc: # noqa: BLE001
payload = {"status": "ERROR", "error": str(exc)}
with open(report_path, "w", encoding="utf-8") as handle:
json.dump(payload, handle, indent=2, sort_keys=True)
handle.write("\\n")
PY
'''
}
}
stage('Collect IronBank evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --skip-files clusters/atlas/flux-system/gotk-components.yaml --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
'''
}
sh '''
set -eu
mkdir -p build
if [ -s build/trivy-fs.json ]; then
python3 ci/scripts/supply_chain_report.py --trivy-json build/trivy-fs.json --waivers ci/titan-iac-trivy-waivers.json --output build/ironbank-compliance.json
exit 0
fi
python3 - <<'PY'
import json
import os
from pathlib import Path
report_path = Path(os.getenv('QUALITY_GATE_IRONBANK_REPORT', 'build/ironbank-compliance.json'))
if report_path.exists():
raise SystemExit(0)
status = os.getenv('IRONBANK_COMPLIANCE_STATUS', '').strip()
compliant = os.getenv('IRONBANK_COMPLIANT', '').strip().lower()
payload = {
"status": status or "unknown",
"compliant": compliant in {"1", "true", "yes", "on"} if compliant else None,
}
payload = {k: v for k, v in payload.items() if v is not None}
if "status" not in payload:
payload["status"] = "unknown"
payload["note"] = (
"Set IRONBANK_COMPLIANCE_STATUS/IRONBANK_COMPLIANT "
"or write build/ironbank-compliance.json in image-building repos."
)
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\\n", encoding="utf-8")
PY
'''
}
}
stage('Run quality gate') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile jenkins --build-dir build
quality_gate_rc=$?
set -e
printf '%s\n' "${quality_gate_rc}" > build/quality-gate.rc
'''
}
}
stage('Publish test metrics') {
steps {
sh '''
set -eu
export JUNIT_GLOB='build/junit-*.xml'
export QUALITY_GATE_EXIT_CODE_PATH='build/quality-gate.rc'
export QUALITY_GATE_SUMMARY_PATH='build/quality-gate-summary.json'
python3 ci/scripts/publish_test_metrics.py
'''
}
}
stage('Enforce quality gate') {
steps {
sh '''
set -euo pipefail
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
fail=0
if [ "${gate_rc}" -ne 0 ]; then
echo "quality gate failed with rc=${gate_rc}" >&2
fail=1
fi
enabled() {
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
1|true|yes|on) return 0 ;;
*) return 1 ;;
esac
}
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
sonar_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/sonarqube-quality-gate.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
print(status or "missing")
PY
)"
case "${sonar_status}" in
ok|pass|passed|success) ;;
*)
echo "sonarqube gate failed: ${sonar_status}" >&2
fail=1
;;
esac
fi
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
ironbank_required=1
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
supply_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/ironbank-compliance.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
compliant = payload.get("compliant")
if compliant is True:
print("ok")
elif compliant is False:
print("failed")
else:
status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
print(status or "missing")
PY
)"
case "${supply_status}" in
ok|pass|passed|success|compliant) ;;
not_applicable|na|n/a)
if enabled "${ironbank_required}"; then
echo "supply chain gate required but status=${supply_status}" >&2
fail=1
fi
;;
*)
if enabled "${ironbank_required}"; then
echo "supply chain gate failed: ${supply_status}" >&2
fail=1
else
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
fi
;;
esac
fi
exit "${fail}"
'''
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
@ -354,7 +45,7 @@ PY
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "grep -m1 '^\\s*branch:' clusters/atlas/flux-system/gotk-sync.yaml | sed 's/^\\s*branch:\\s*//'"
script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
@ -373,20 +64,6 @@ PY
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set -euo pipefail
if ! command -v git >/dev/null 2>&1; then
if command -v apk >/dev/null 2>&1; then
apk add --no-cache git >/dev/null
elif command -v apt-get >/dev/null 2>&1; then
apt-get update >/dev/null
apt-get install -y git >/dev/null
fi
fi
cd "${WORKSPACE:-$PWD}"
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
echo "workspace is not a git checkout; skipping promote"
exit 0
fi
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
@ -397,18 +74,4 @@ PY
}
}
}
post {
always {
script {
if (fileExists('build/junit-unit.xml') || fileExists('build/junit-glue.xml')) {
try {
junit allowEmptyResults: true, testResults: 'build/junit-*.xml'
} catch (Throwable err) {
echo "junit step unavailable: ${err.class.simpleName}"
}
}
}
archiveArtifacts artifacts: 'build/**', allowEmptyArchive: true, fingerprint: true
}
}
}

View File

@ -1,29 +1,3 @@
# titan-iac
Flux-managed Kubernetes desired-state config for `bstein.dev`.
Canonical source URL:
- `ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git`
## Scope
This repo contains cluster configuration consumed by Flux:
- platform/infrastructure manifests
- service manifests and kustomizations
- operational scripts for render/reconcile workflows
This repo is **not** the Ananke application source repo.
Ananke lives in `bstein/ananke` and orchestrates host-side shutdown/startup behavior around this desired state.
## Validation workflow
```bash
kustomize build services/<app>
kubectl apply --server-side --dry-run=client -k services/<app>
flux reconcile kustomization <name> --namespace flux-system --with-source
```
## Apply model
Use Git + Flux as the source of truth.
Avoid manual in-cluster edits for durable changes.
Flux-managed Kubernetes cluster for bstein.dev services.

View File

@ -11,19 +11,8 @@ spec:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: jnlp
image: jenkins/inbound-agent:3355.v388858a_47b_33-2-jdk21
resources:
requests:
cpu: "25m"
memory: "256Mi"
- name: python
image: registry.bstein.dev/bstein/python:3.12-slim
command:
- cat
tty: true
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
image: python:3.12-slim
command:
- cat
tty: true
@ -33,21 +22,6 @@ spec:
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
SUITE_NAME = 'titan_iac'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'titan_iac'
SONARQUBE_TOKEN = credentials('sonarqube-token')
VM_URL = 'http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428'
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_ENFORCE = '1'
QUALITY_GATE_IRONBANK_REQUIRED = '0'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
}
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(daysToKeepStr: '30', numToKeepStr: '200', artifactDaysToKeepStr: '30', artifactNumToKeepStr: '120'))
}
stages {
stage('Checkout') {
@ -57,295 +31,12 @@ spec:
}
stage('Install deps') {
steps {
sh '''
set -eu
if ! command -v git >/dev/null 2>&1; then
apt-get update
apt-get install -y --no-install-recommends git ca-certificates
rm -rf /var/lib/apt/lists/*
fi
pip install --no-cache-dir -r ci/requirements.txt
'''
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Prepare local quality evidence') {
stage('Glue tests') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile local --build-dir build
local_quality_rc=$?
set -e
printf '%s\n' "${local_quality_rc}" > build/local-quality-gate.rc
'''
}
}
stage('Collect SonarQube evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=."
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**,services/monitoring/dashboards/**,services/monitoring/grafana-dashboard-*.yaml"
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
[ -f build/coverage-unit.xml ] && args+=("-Dsonar.python.coverage.reportPaths=build/coverage-unit.xml")
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
}
sh '''
set -eu
mkdir -p build
python3 - <<'PY'
import base64
import json
import os
import time
import urllib.parse
import urllib.request
from pathlib import Path
host = os.getenv('SONARQUBE_HOST_URL', '').strip().rstrip('/')
project_key = os.getenv('SONARQUBE_PROJECT_KEY', '').strip()
token = os.getenv('SONARQUBE_TOKEN', '').strip()
report_path = os.getenv('QUALITY_GATE_SONARQUBE_REPORT', 'build/sonarqube-quality-gate.json')
payload = {
"status": "ERROR",
"note": "missing SONARQUBE_HOST_URL and/or SONARQUBE_PROJECT_KEY",
}
if host and project_key:
task_file = Path('.scannerwork/report-task.txt')
task_id = ''
if task_file.exists():
for line in task_file.read_text(encoding='utf-8').splitlines():
key, _, value = line.partition('=')
if key == 'ceTaskId':
task_id = value.strip()
break
if task_id:
ce_query = urllib.parse.urlencode({"id": task_id})
deadline = time.monotonic() + 180
while time.monotonic() < deadline:
ce_request = urllib.request.Request(f"{host}/api/ce/task?{ce_query}", method="GET")
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
ce_request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(ce_request, timeout=12) as response:
ce_payload = json.loads(response.read().decode("utf-8"))
except Exception:
time.sleep(3)
continue
status = str(ce_payload.get("task", {}).get("status", "")).upper()
if status in {"SUCCESS", "FAILED", "CANCELED"}:
break
time.sleep(3)
query = urllib.parse.urlencode({"projectKey": project_key})
request = urllib.request.Request(
f"{host}/api/qualitygates/project_status?{query}",
method="GET",
)
if token:
encoded = base64.b64encode(f"{token}:".encode("utf-8")).decode("utf-8")
request.add_header("Authorization", f"Basic {encoded}")
try:
with urllib.request.urlopen(request, timeout=12) as response:
payload = json.loads(response.read().decode("utf-8"))
except Exception as exc: # noqa: BLE001
payload = {"status": "ERROR", "error": str(exc)}
with open(report_path, "w", encoding="utf-8") as handle:
json.dump(payload, handle, indent=2, sort_keys=True)
handle.write("\\n")
PY
'''
}
}
stage('Collect IronBank evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --skip-files clusters/atlas/flux-system/gotk-components.yaml --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
'''
}
sh '''
set -eu
mkdir -p build
if [ -s build/trivy-fs.json ]; then
python3 ci/scripts/supply_chain_report.py --trivy-json build/trivy-fs.json --waivers ci/titan-iac-trivy-waivers.json --output build/ironbank-compliance.json
exit 0
fi
python3 - <<'PY'
import json
import os
from pathlib import Path
report_path = Path(os.getenv('QUALITY_GATE_IRONBANK_REPORT', 'build/ironbank-compliance.json'))
if report_path.exists():
raise SystemExit(0)
status = os.getenv('IRONBANK_COMPLIANCE_STATUS', '').strip()
compliant = os.getenv('IRONBANK_COMPLIANT', '').strip().lower()
payload = {
"status": status or "unknown",
"compliant": compliant in {"1", "true", "yes", "on"} if compliant else None,
}
payload = {k: v for k, v in payload.items() if v is not None}
if "status" not in payload:
payload["status"] = "unknown"
payload["note"] = (
"Set IRONBANK_COMPLIANCE_STATUS/IRONBANK_COMPLIANT "
"or write build/ironbank-compliance.json in image-building repos."
)
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\\n", encoding="utf-8")
PY
'''
}
}
stage('Run quality gate') {
steps {
sh '''
set -eu
mkdir -p build
set +e
python3 -m testing.quality_gate --profile jenkins --build-dir build
quality_gate_rc=$?
set -e
printf '%s\n' "${quality_gate_rc}" > build/quality-gate.rc
'''
}
}
stage('Publish test metrics') {
steps {
sh '''
set -eu
export JUNIT_GLOB='build/junit-*.xml'
export QUALITY_GATE_EXIT_CODE_PATH='build/quality-gate.rc'
export QUALITY_GATE_SUMMARY_PATH='build/quality-gate-summary.json'
python3 ci/scripts/publish_test_metrics.py
'''
}
}
stage('Enforce quality gate') {
steps {
sh '''
set -euo pipefail
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
fail=0
if [ "${gate_rc}" -ne 0 ]; then
echo "quality gate failed with rc=${gate_rc}" >&2
fail=1
fi
enabled() {
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
1|true|yes|on) return 0 ;;
*) return 1 ;;
esac
}
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
sonar_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/sonarqube-quality-gate.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
print(status or "missing")
PY
)"
case "${sonar_status}" in
ok|pass|passed|success) ;;
*)
echo "sonarqube gate failed: ${sonar_status}" >&2
fail=1
;;
esac
fi
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
ironbank_required=1
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
supply_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/ironbank-compliance.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
compliant = payload.get("compliant")
if compliant is True:
print("ok")
elif compliant is False:
print("failed")
else:
status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
print(status or "missing")
PY
)"
case "${supply_status}" in
ok|pass|passed|success|compliant) ;;
not_applicable|na|n/a)
if enabled "${ironbank_required}"; then
echo "supply chain gate required but status=${supply_status}" >&2
fail=1
fi
;;
*)
if enabled "${ironbank_required}"; then
echo "supply chain gate failed: ${supply_status}" >&2
fail=1
else
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
fi
;;
esac
fi
exit "${fail}"
'''
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
@ -372,20 +63,6 @@ PY
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set -euo pipefail
if ! command -v git >/dev/null 2>&1; then
if command -v apk >/dev/null 2>&1; then
apk add --no-cache git >/dev/null
elif command -v apt-get >/dev/null 2>&1; then
apt-get update >/dev/null
apt-get install -y git >/dev/null
fi
fi
cd "${WORKSPACE:-$PWD}"
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
echo "workspace is not a git checkout; skipping promote"
exit 0
fi
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
@ -396,18 +73,4 @@ PY
}
}
}
post {
always {
script {
if (fileExists('build/junit-unit.xml') || fileExists('build/junit-glue.xml')) {
try {
junit allowEmptyResults: true, testResults: 'build/junit-*.xml'
} catch (Throwable err) {
echo "junit step unavailable: ${err.class.simpleName}"
}
}
}
archiveArtifacts artifacts: 'build/**', allowEmptyArchive: true, fingerprint: true
}
}
}

View File

@ -1,7 +1,4 @@
pytest==8.3.4
pytest-cov==6.0.0
coverage==7.6.10
kubernetes==30.1.0
PyYAML==6.0.2
requests==2.32.3
ruff==0.8.4

View File

@ -1,352 +0,0 @@
#!/usr/bin/env python3
"""Publish titan-iac quality-gate results to Pushgateway."""
from __future__ import annotations
import json
import os
from glob import glob
from pathlib import Path
import sys
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from ci.scripts import publish_test_metrics_quality as _quality_helpers
CANONICAL_CHECKS = _quality_helpers.CANONICAL_CHECKS
_build_check_statuses = _quality_helpers._build_check_statuses
_combine_statuses = _quality_helpers._combine_statuses
_infer_sonarqube_status = _quality_helpers._infer_sonarqube_status
_infer_source_lines_over_500 = _quality_helpers._infer_source_lines_over_500
_infer_supply_chain_status = _quality_helpers._infer_supply_chain_status
_infer_workspace_coverage_percent = _quality_helpers._infer_workspace_coverage_percent
_load_optional_json = _quality_helpers._load_optional_json
_normalize_result_status = _quality_helpers._normalize_result_status
def _escape_label(value: str) -> str:
"""Escape a Prometheus label value without changing its content."""
return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
def _label_str(labels: dict[str, str]) -> str:
"""Render a stable Prometheus label set from a mapping."""
parts = [f'{key}="{_escape_label(val)}"' for key, val in labels.items() if val]
return "{" + ",".join(parts) + "}" if parts else ""
def _read_text(url: str) -> str:
"""Fetch a plain-text response body from the given URL."""
with urllib.request.urlopen(url, timeout=10) as response:
return response.read().decode("utf-8")
def _post_text(url: str, payload: str) -> None:
"""PUT a plain-text payload and fail on any 4xx/5xx response."""
request = urllib.request.Request(
url,
data=payload.encode("utf-8"),
method="PUT",
headers={"Content-Type": "text/plain"},
)
with urllib.request.urlopen(request, timeout=10) as response:
if response.status >= 400:
raise RuntimeError(f"push failed with status={response.status}")
def _parse_junit(path: str) -> dict[str, int]:
"""Parse a JUnit XML file into aggregate test counters."""
if not os.path.exists(path):
return {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
tree = ET.parse(path)
root = tree.getroot()
totals = {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
suites: list[ET.Element]
if root.tag == "testsuite":
suites = [root]
elif root.tag == "testsuites":
suites = [elem for elem in root if elem.tag == "testsuite"]
else:
suites = []
for suite in suites:
for key in totals:
raw_value = suite.attrib.get(key, "0")
try:
totals[key] += int(float(raw_value))
except ValueError:
totals[key] += 0
return totals
def _collect_junit_totals(pattern: str) -> dict[str, int]:
"""Sum JUnit counters across every XML file matching the pattern."""
totals = {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
for path in sorted(glob(pattern)):
parsed = _parse_junit(path)
for key in totals:
totals[key] += parsed[key]
return totals
def _collect_junit_cases(pattern: str) -> list[tuple[str, str]]:
"""Collect individual JUnit test-case statuses for flaky-test trend panels."""
cases: list[tuple[str, str]] = []
for path in sorted(glob(pattern)):
if not os.path.exists(path):
continue
root = ET.parse(path).getroot()
suites: list[ET.Element]
if root.tag == "testsuite":
suites = [root]
elif root.tag == "testsuites":
suites = [elem for elem in root if elem.tag == "testsuite"]
else:
suites = []
for suite in suites:
for test_case in suite.findall("testcase"):
case_name = test_case.attrib.get("name", "").strip()
class_name = test_case.attrib.get("classname", "").strip()
if not case_name:
continue
full_name = f"{class_name}.{case_name}" if class_name else case_name
status = "passed"
if test_case.find("failure") is not None or test_case.find("error") is not None:
status = "failed"
elif test_case.find("skipped") is not None:
status = "skipped"
cases.append((full_name, status))
return cases
def _read_exit_code(path: str) -> int:
"""Read the quality-gate exit code, defaulting to failure if missing."""
try:
with open(path, "r", encoding="utf-8") as handle:
return int(handle.read().strip())
except (FileNotFoundError, ValueError):
return 1
def _load_summary(path: str) -> dict:
"""Load the JSON quality-gate summary, returning an empty mapping on error."""
try:
with open(path, "r", encoding="utf-8") as handle:
return json.load(handle)
except (FileNotFoundError, json.JSONDecodeError):
return {}
def _summary_float(summary: dict, key: str) -> float:
"""Extract a float-like value from the summary, defaulting to 0.0."""
value = summary.get(key)
if isinstance(value, (int, float)):
return float(value)
return 0.0
def _summary_int(summary: dict, key: str) -> int:
"""Extract an int-like value from the summary, defaulting to 0."""
value = summary.get(key)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
return 0
def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str, str]) -> float:
"""Return the current counter value for a labeled metric if present."""
text = _read_text(f"{pushgateway_url.rstrip('/')}/metrics")
for line in text.splitlines():
if not line.startswith(metric + "{"):
continue
if any(f'{key}="{value}"' not in line for key, value in labels.items()):
continue
parts = line.split()
if len(parts) < 2:
continue
try:
return float(parts[1])
except ValueError:
return 0.0
return 0.0
def _build_payload(
    suite: str,
    status: str,
    tests: dict[str, int],
    test_cases: list[tuple[str, str]],
    ok_count: int,
    failed_count: int,
    branch: str,
    build_number: str,
    jenkins_job: str,
    summary: dict | None = None,
    workspace_line_coverage_percent: float = 0.0,
    source_lines_over_500: int = 0,
    check_statuses: dict[str, str] | None = None,
) -> str:
    """Build the Pushgateway payload for the current suite run.

    Emits run counters, per-result test totals, run-status gauges, build
    info, coverage and file-length gauges, optional per-check results, and
    one gauge series per test case (or a sentinel series when none exist).

    NOTE(review): ``summary`` is accepted but never read in this body —
    presumably retained for call-site compatibility; confirm.
    """
    # Anything not failed/errored/skipped counts as passed; clamp at zero
    # in case the JUnit totals are inconsistent.
    passed = max(tests["tests"] - tests["failures"] - tests["errors"] - tests["skipped"], 0)
    build_labels = _label_str(
        {
            "suite": suite,
            "branch": branch or "unknown",
            "build_number": build_number or "unknown",
            "jenkins_job": jenkins_job or suite,
        }
    )
    test_case_base_labels = {
        "suite": suite,
        "branch": branch or "unknown",
        "build_number": build_number or "unknown",
        "jenkins_job": jenkins_job or suite,
    }
    # Prometheus text exposition: each "# TYPE" header precedes its samples.
    lines = [
        "# TYPE platform_quality_gate_runs_total counter",
        f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
        f'platform_quality_gate_runs_total{{suite="{suite}",status="failed"}} {failed_count}',
        "# TYPE titan_iac_quality_gate_tests_total gauge",
        f'titan_iac_quality_gate_tests_total{{suite="{suite}",result="passed"}} {passed}',
        f'titan_iac_quality_gate_tests_total{{suite="{suite}",result="failed"}} {tests["failures"]}',
        f'titan_iac_quality_gate_tests_total{{suite="{suite}",result="error"}} {tests["errors"]}',
        f'titan_iac_quality_gate_tests_total{{suite="{suite}",result="skipped"}} {tests["skipped"]}',
        "# TYPE titan_iac_quality_gate_run_status gauge",
        f'titan_iac_quality_gate_run_status{{suite="{suite}",status="ok"}} {1 if status == "ok" else 0}',
        f'titan_iac_quality_gate_run_status{{suite="{suite}",status="failed"}} {1 if status == "failed" else 0}',
        "# TYPE platform_quality_gate_build_info gauge",
        f"platform_quality_gate_build_info{build_labels} 1",
        "# TYPE titan_iac_quality_gate_build_info gauge",
        f"titan_iac_quality_gate_build_info{build_labels} 1",
        "# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
        f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {workspace_line_coverage_percent:.3f}',
        "# TYPE platform_quality_gate_source_lines_over_500_total gauge",
        f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
    ]
    if check_statuses:
        lines.append("# TYPE titan_iac_quality_gate_checks_total gauge")
        # Every canonical check gets a series; absent ones report not_applicable.
        for check_name in CANONICAL_CHECKS:
            check_status = check_statuses.get(check_name, "not_applicable")
            lines.append(
                f'titan_iac_quality_gate_checks_total{{suite="{suite}",check="{_escape_label(check_name)}",result="{_escape_label(check_status)}"}} 1'
            )
    lines.append("# TYPE platform_quality_gate_test_case_result gauge")
    if test_cases:
        for test_name, test_status in test_cases:
            labels = {
                **test_case_base_labels,
                "test": test_name,
                "status": test_status,
            }
            lines.append(
                f"platform_quality_gate_test_case_result{_label_str(labels)} 1"
            )
    else:
        # Sentinel series keeps dashboards from treating "no cases" as "no data".
        labels = {**test_case_base_labels, "test": "__no_test_cases__", "status": "skipped"}
        lines.append(
            f"platform_quality_gate_test_case_result{_label_str(labels)} 1"
        )
    # The exposition format requires the body to end with a newline.
    return "\n".join(lines) + "\n"
def main() -> int:
    """Publish the quality-gate metrics and print a compact run summary.

    Reads configuration from environment variables, aggregates JUnit results,
    the exit-code file, and the JSON summary, carries forward the run
    counters already stored in the Pushgateway, pushes the payload, and
    prints a one-line JSON summary for the build log. Always returns 0.
    """
    # Environment-driven configuration with CI-friendly defaults.
    suite = os.getenv("SUITE_NAME", "titan_iac")
    pushgateway_url = os.getenv("PUSHGATEWAY_URL", "http://platform-quality-gateway.monitoring.svc.cluster.local:9091")
    job_name = os.getenv("QUALITY_GATE_JOB_NAME", "platform-quality-ci")
    # Newer variable names win; JUNIT_PATH / GLUE_EXIT_CODE_PATH keep older jobs working.
    junit_glob = os.getenv("JUNIT_GLOB", os.getenv("JUNIT_PATH", "build/junit-*.xml"))
    exit_code_path = os.getenv("QUALITY_GATE_EXIT_CODE_PATH", os.getenv("GLUE_EXIT_CODE_PATH", "build/quality-gate.rc"))
    summary_path = os.getenv("QUALITY_GATE_SUMMARY_PATH", "build/quality-gate-summary.json")
    branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
    # Jenkins GIT_BRANCH often carries an "origin/" prefix; strip it for label hygiene.
    if branch.startswith("origin/"):
        branch = branch[len("origin/") :]
    build_number = os.getenv("BUILD_NUMBER", "")
    jenkins_job = os.getenv("JOB_NAME", "titan-iac")
    tests = _collect_junit_totals(junit_glob)
    test_cases = _collect_junit_cases(junit_glob)
    exit_code = _read_exit_code(exit_code_path)
    status = "ok" if exit_code == 0 else "failed"
    summary = _load_summary(summary_path)
    workspace_line_coverage_percent = _summary_float(summary, "workspace_line_coverage_percent")
    # When the summary lacks a usable coverage figure, fall back to the coverage XML.
    if workspace_line_coverage_percent <= 0:
        workspace_line_coverage_percent = _infer_workspace_coverage_percent(summary, "build/coverage-unit.xml")
    source_lines_over_500 = _summary_int(summary, "source_lines_over_500")
    if source_lines_over_500 <= 0:
        source_lines_over_500 = _infer_source_lines_over_500(summary)
    sonarqube_report = _load_optional_json(os.getenv("QUALITY_GATE_SONARQUBE_REPORT", "build/sonarqube-quality-gate.json"))
    supply_chain_report = _load_optional_json(os.getenv("QUALITY_GATE_IRONBANK_REPORT", "build/ironbank-compliance.json"))
    supply_chain_required = os.getenv("QUALITY_GATE_IRONBANK_REQUIRED", "0").strip().lower() in {"1", "true", "yes", "on"}
    check_statuses = _build_check_statuses(
        summary=summary,
        tests=tests,
        workspace_line_coverage_percent=workspace_line_coverage_percent,
        source_lines_over_500=source_lines_over_500,
        sonarqube_report=sonarqube_report,
        supply_chain_report=supply_chain_report,
        supply_chain_required=supply_chain_required,
    )
    # A push replaces the stored series, so fetch the previous totals and add
    # this run before re-pushing the counters.
    ok_count = int(
        _fetch_existing_counter(
            pushgateway_url,
            "platform_quality_gate_runs_total",
            {"job": job_name, "suite": suite, "status": "ok"},
        )
    )
    failed_count = int(
        _fetch_existing_counter(
            pushgateway_url,
            "platform_quality_gate_runs_total",
            {"job": job_name, "suite": suite, "status": "failed"},
        )
    )
    if status == "ok":
        ok_count += 1
    else:
        failed_count += 1
    payload = _build_payload(
        suite=suite,
        status=status,
        tests=tests,
        test_cases=test_cases,
        ok_count=ok_count,
        failed_count=failed_count,
        branch=branch,
        build_number=build_number,
        jenkins_job=jenkins_job,
        summary=summary,
        workspace_line_coverage_percent=workspace_line_coverage_percent,
        source_lines_over_500=source_lines_over_500,
        check_statuses=check_statuses,
    )
    push_url = f"{pushgateway_url.rstrip('/')}/metrics/job/{job_name}/suite/{suite}"
    _post_text(push_url, payload)
    # Rebinding "summary" shadows the loaded gate summary; from here on it is
    # only the compact log line printed below.
    summary = {
        "suite": suite,
        "status": status,
        "tests_total": tests["tests"],
        "tests_failed": tests["failures"],
        "tests_error": tests["errors"],
        "tests_skipped": tests["skipped"],
        "ok_count": ok_count,
        "failed_count": failed_count,
        "checks_recorded": len(check_statuses),
        "workspace_line_coverage_percent": workspace_line_coverage_percent,
        "source_lines_over_500": source_lines_over_500,
    }
    print(json.dumps(summary, sort_keys=True))
    return 0
if __name__ == "__main__": # pragma: no cover
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())

View File

@ -1,200 +0,0 @@
#!/usr/bin/env python3
"""Quality/status helpers for publish_test_metrics."""
from __future__ import annotations
import json
from pathlib import Path
import xml.etree.ElementTree as ET
# Lower-cased status strings that map to the canonical "ok" bucket.
SUCCESS_STATUSES = {"ok", "pass", "passed", "success", "compliant"}
# Status strings meaning the check did not apply to this run.
NOT_APPLICABLE_STATUSES = {"not_applicable", "n/a", "na", "none", "skipped"}
# Status strings treated as failures, including warning-style states.
FAILED_STATUSES = {"failed", "fail", "error", "errors", "warn", "warning", "red"}
# Ordered list of the checks every dashboard row reports on.
CANONICAL_CHECKS = [
    "tests",
    "coverage",
    "loc",
    "docs_naming",
    "gate_glue",
    "sonarqube",
    "supply_chain",
]
def _infer_workspace_coverage_percent(summary: dict, default_xml: str) -> float:
"""Infer workspace line coverage from quality summary coverage XML metadata."""
results = summary.get("results", []) if isinstance(summary, dict) else []
coverage_xml = default_xml
for result in results:
if not isinstance(result, dict):
continue
if str(result.get("name") or "").strip().lower() != "coverage":
continue
candidate = str(result.get("coverage_xml") or "").strip()
if candidate:
coverage_xml = candidate
break
xml_path = Path(coverage_xml)
if not xml_path.exists():
return 0.0
try:
root = ET.parse(xml_path).getroot()
line_rate = root.attrib.get("line-rate")
if line_rate is None:
return 0.0
return float(line_rate) * 100.0
except (ET.ParseError, OSError, ValueError):
return 0.0
def _infer_source_lines_over_500(summary: dict) -> int:
"""Infer over-limit source file count from hygiene issue payloads."""
results = summary.get("results", []) if isinstance(summary, dict) else []
for result in results:
if not isinstance(result, dict):
continue
if str(result.get("name") or "").strip().lower() not in {"hygiene", "loc", "smell"}:
continue
issues = result.get("issues")
if not isinstance(issues, list):
continue
return sum(1 for item in issues if isinstance(item, str) and item.startswith("file exceeds"))
return 0
def _normalize_result_status(value: str | None, default: str = "failed") -> str:
    """Translate free-form status text into "ok"/"failed"/"not_applicable", else ``default``."""
    if not value:
        return default
    token = value.strip().lower()
    # The three buckets are disjoint, so lookup order does not matter.
    if token in FAILED_STATUSES:
        return "failed"
    if token in SUCCESS_STATUSES:
        return "ok"
    if token in NOT_APPLICABLE_STATUSES:
        return "not_applicable"
    return default
def _load_optional_json(path: str | None) -> dict:
"""Load an optional JSON report file, returning an empty object when absent."""
if not path:
return {}
candidate = Path(path)
if not candidate.exists():
return {}
try:
return json.loads(candidate.read_text(encoding="utf-8"))
except json.JSONDecodeError:
return {}
def _combine_statuses(statuses: list[str]) -> str:
"""Roll up many check statuses into one canonical result."""
if not statuses:
return "not_applicable"
if any(status == "failed" for status in statuses):
return "failed"
if all(status == "not_applicable" for status in statuses):
return "not_applicable"
if all(status in {"ok", "not_applicable"} for status in statuses):
return "ok"
return "failed"
def _infer_sonarqube_status(report: dict) -> str:
    """Map a SonarQube quality-gate report onto the canonical check statuses."""
    if not report:
        return "not_applicable"
    # Probe the known payload shapes in priority order; falsy values fall through.
    raw = (
        report.get("projectStatus", {}).get("status")
        or report.get("qualityGate", {}).get("status")
        or report.get("status")
    )
    text = str(raw) if raw is not None else None
    return _normalize_result_status(text, default="failed")
def _infer_supply_chain_status(report: dict, required: bool) -> str:
"""Infer canonical supply-chain status from IronBank/artifact report payload."""
if not report:
return "failed" if required else "not_applicable"
compliant = report.get("compliant")
if isinstance(compliant, bool):
return "ok" if compliant else "failed"
status = report.get("status")
if status is None:
return "failed" if required else "not_applicable"
normalized = _normalize_result_status(str(status), default="failed")
if normalized == "not_applicable" and required:
return "failed"
return normalized
def _build_check_statuses(
    summary: dict | None,
    tests: dict[str, int],
    workspace_line_coverage_percent: float,
    source_lines_over_500: int,
    sonarqube_report: dict,
    supply_chain_report: dict,
    supply_chain_required: bool,
) -> dict[str, str]:
    """Generate the canonical quality-check status map for dashboarding.

    Explicit statuses from the summary's ``results`` entries take priority;
    each canonical check then falls back to inference from related result
    names or from the raw metrics passed in.
    """
    raw_results = summary.get("results", []) if isinstance(summary, dict) else []
    # Index normalized statuses by lower-cased result name for the lookups below.
    status_by_name: dict[str, str] = {}
    for result in raw_results:
        if not isinstance(result, dict):
            continue
        check_name = str(result.get("name") or "").strip().lower()
        if not check_name:
            continue
        status_by_name[check_name] = _normalize_result_status(result.get("status"), default="failed")
    tests_status = status_by_name.get("tests")
    if not tests_status:
        # Roll up any per-suite test entries; otherwise infer from JUnit totals.
        candidate_keys = ["unit", "integration", "e2e", "pytest", "test", "tests"]
        candidates = [status_by_name[key] for key in candidate_keys if key in status_by_name]
        if candidates:
            tests_status = _combine_statuses(candidates)
        elif tests["tests"] > 0:
            tests_status = "ok" if (tests["failures"] + tests["errors"]) == 0 else "failed"
        else:
            tests_status = "not_applicable"
    coverage_status = status_by_name.get("coverage")
    if not coverage_status:
        # 95% is the inferred pass threshold when no explicit status exists.
        if workspace_line_coverage_percent > 0:
            coverage_status = "ok" if workspace_line_coverage_percent >= 95.0 else "failed"
        else:
            coverage_status = "not_applicable"
    loc_status = status_by_name.get("loc")
    if not loc_status:
        loc_status = "ok" if source_lines_over_500 == 0 else "failed"
    docs_naming_status = status_by_name.get("docs_naming")
    if not docs_naming_status:
        candidates = [status_by_name[key] for key in ["docs", "hygiene", "smell", "lint", "naming"] if key in status_by_name]
        docs_naming_status = _combine_statuses(candidates) if candidates else "not_applicable"
    gate_glue_status = status_by_name.get("gate_glue")
    if not gate_glue_status:
        candidates = [status_by_name[key] for key in ["gate_glue", "glue", "gate"] if key in status_by_name]
        gate_glue_status = _combine_statuses(candidates) if candidates else "not_applicable"
    # Scanner-driven checks fall back to their external report payloads.
    sonarqube_status = status_by_name.get("sonarqube") or _infer_sonarqube_status(sonarqube_report)
    supply_chain_status = status_by_name.get("supply_chain") or _infer_supply_chain_status(
        supply_chain_report,
        required=supply_chain_required,
    )
    return {
        "tests": tests_status,
        "coverage": coverage_status,
        "loc": loc_status,
        "docs_naming": docs_naming_status,
        "gate_glue": gate_glue_status,
        "sonarqube": sonarqube_status,
        "supply_chain": supply_chain_status,
    }

View File

@ -1,173 +0,0 @@
"""Build a titan-iac supply-chain compliance report from Trivy evidence."""
from __future__ import annotations
import argparse
import datetime as dt
import json
from pathlib import Path
from typing import Any
# Trivy severities whose failed misconfigurations count against the gate.
FAIL_SEVERITIES = {"HIGH", "CRITICAL"}
def _read_json(path: Path) -> dict[str, Any]:
"""Read a JSON object from disk for use as pipeline evidence."""
payload = json.loads(path.read_text(encoding="utf-8"))
if not isinstance(payload, dict):
raise ValueError(f"{path} must contain a JSON object")
return payload
def _parse_day(raw: str | None) -> dt.date | None:
"""Parse an ISO day while letting optional waiver dates stay optional."""
if not raw:
return None
return dt.date.fromisoformat(raw)
def _today(override: str | None = None) -> dt.date:
    """Resolve the policy day, preferring an explicit override so tests can pin expiry behavior."""
    parsed = _parse_day(override)
    return parsed if parsed is not None else dt.date.today()
def _load_waiver_pairs(path: Path | None, policy_day: dt.date) -> tuple[set[tuple[str, str]], int]:
    """Collect active ``(misconfiguration id, target)`` waivers plus the expired-target count.

    Entries with an expiry before ``policy_day`` contribute their target count
    to the expired total instead of the active set.
    """
    if path is None or not path.exists():
        return set(), 0
    payload = _read_json(path)
    shared_expiry = payload.get("default_expires_at")
    active_pairs: set[tuple[str, str]] = set()
    expired_targets = 0
    for record in payload.get("misconfigurations", []):
        if not isinstance(record, dict):
            continue
        waiver_id = str(record.get("id") or "").strip()
        if not waiver_id:
            continue
        expiry = _parse_day(str(record.get("expires_at") or shared_expiry or ""))
        targets = record.get("targets", [])
        if not isinstance(targets, list):
            continue
        if expiry and expiry < policy_day:
            expired_targets += len(targets)
            continue
        # Waivers stay target-specific: a newly unsafe manifest fails until it
        # is fixed or deliberately accepted with its own fresh expiration.
        for target in targets:
            if isinstance(target, str) and target:
                active_pairs.add((waiver_id, target))
    return active_pairs, expired_targets
def _iter_failed_misconfigurations(payload: dict[str, Any]):
    """Yield ``(target, record)`` for each failed HIGH/CRITICAL Trivy misconfiguration."""
    for scan_result in payload.get("Results", []):
        if not isinstance(scan_result, dict):
            continue
        target = str(scan_result.get("Target") or "")
        for record in scan_result.get("Misconfigurations") or []:
            if not isinstance(record, dict):
                continue
            if record.get("Status") != "FAIL":
                continue
            severity = str(record.get("Severity") or "").upper()
            if severity in FAIL_SEVERITIES:
                yield target, record
def _count_vulnerabilities(payload: dict[str, Any], severity: str) -> int:
"""Count Trivy vulnerabilities at a specific severity."""
count = 0
for result in payload.get("Results", []):
if not isinstance(result, dict):
continue
for item in result.get("Vulnerabilities") or []:
if isinstance(item, dict) and str(item.get("Severity") or "").upper() == severity:
count += 1
return count
def _count_secrets(payload: dict[str, Any]) -> int:
"""Count detected secrets in the Trivy filesystem report."""
count = 0
for result in payload.get("Results", []):
if isinstance(result, dict):
count += len(result.get("Secrets") or [])
return count
def build_report(
    trivy_payload: dict[str, Any],
    waiver_path: Path | None = None,
    today_override: str | None = None,
) -> dict[str, Any]:
    """Summarize Trivy evidence into the compliance report consumed by the quality gate.

    Unwaived HIGH/CRITICAL misconfigurations, CRITICAL vulnerabilities, and
    any detected secrets fail the report; HIGH vulnerabilities are observed
    but do not gate.
    """
    policy_day = _today(today_override)
    active_waivers, expired_waivers = _load_waiver_pairs(waiver_path, policy_day)
    open_misconfigs: list[dict[str, str]] = []
    waived_misconfigs = 0
    for target, record in _iter_failed_misconfigurations(trivy_payload):
        misconfig_id = str(record.get("ID") or "")
        if (misconfig_id, target) in active_waivers:
            waived_misconfigs += 1
        else:
            open_misconfigs.append(
                {
                    "id": misconfig_id,
                    "target": target,
                    "severity": str(record.get("Severity") or ""),
                    "title": str(record.get("Title") or ""),
                }
            )
    critical_count = _count_vulnerabilities(trivy_payload, "CRITICAL")
    high_count = _count_vulnerabilities(trivy_payload, "HIGH")
    secret_count = _count_secrets(trivy_payload)
    passed = critical_count == 0 and secret_count == 0 and not open_misconfigs
    status = "ok" if passed else "failed"
    return {
        "status": status,
        "compliant": passed,
        "category": "artifact_security",
        "scan_type": "filesystem",
        "scanner": "trivy",
        "critical_vulnerabilities": critical_count,
        "high_vulnerabilities": high_count,
        "high_vulnerability_policy": "observe",
        "secrets": secret_count,
        "high_or_critical_misconfigurations": len(open_misconfigs),
        "waived_misconfigurations": waived_misconfigs,
        "expired_waivers": expired_waivers,
        "waiver_file": str(waiver_path) if waiver_path else "",
        "open_misconfiguration_examples": open_misconfigs[:20],
    }
def main(argv: list[str] | None = None) -> int:
    """CLI entrypoint: read Trivy JSON, apply waivers, and write the compliance report."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--trivy-json", required=True)
    parser.add_argument("--waivers")
    parser.add_argument("--output", required=True)
    parser.add_argument("--today")
    options = parser.parse_args(argv)
    waivers = Path(options.waivers) if options.waivers else None
    report = build_report(
        _read_json(Path(options.trivy_json)),
        waiver_path=waivers,
        today_override=options.today,
    )
    destination = Path(options.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(report, indent=2, sort_keys=True) + "\n"
    destination.write_text(serialized, encoding="utf-8")
    return 0
if __name__ == "__main__": # pragma: no cover
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())

View File

@ -1,7 +1,6 @@
max_success_age_hours: 48
allow_suspended:
- bstein-dev-home/vaultwarden-cred-sync
- comms/guest-name-randomizer
- comms/othrys-room-reset
- comms/pin-othrys-invite
- comms/seed-othrys-room
@ -10,7 +9,6 @@ allow_suspended:
- health/wger-user-sync
- mailu-mailserver/mailu-sync-nightly
- nextcloud/nextcloud-mail-sync
- vault/vault-oidc-config
ariadne_schedule_tasks:
- schedule.mailu_sync
- schedule.nextcloud_sync

View File

@ -1,108 +0,0 @@
"""Glue checks for Ariadne schedules exported to VictoriaMetrics."""
from __future__ import annotations
import os
from datetime import datetime, timezone
from pathlib import Path
import requests
import yaml
# config.yaml lives next to this module so the glue suite is self-contained.
CONFIG_PATH = Path(__file__).with_name("config.yaml")
def _load_config() -> dict:
    """Read the sibling config.yaml, treating an empty document as an empty mapping."""
    with CONFIG_PATH.open("r", encoding="utf-8") as handle:
        loaded = yaml.safe_load(handle)
    return loaded or {}
def _query(promql: str) -> list[dict]:
    """Run an instant PromQL query against VictoriaMetrics and return its result list."""
    base = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
    response = requests.get(f"{base}/api/v1/query", params={"query": promql}, timeout=10)
    response.raise_for_status()
    body = response.json()
    return body.get("data", {}).get("result", [])
def _expected_tasks() -> list[dict]:
    """Load and normalize every configured Ariadne schedule task; the list must be non-empty."""
    cfg = _load_config()
    raw_entries = cfg.get("ariadne_schedule_tasks", [])
    tasks = [_normalize_task(entry, cfg) for entry in raw_entries]
    assert tasks, "No Ariadne schedule tasks configured"
    return tasks
def _normalize_task(item: object, cfg: dict) -> dict:
if isinstance(item, str):
return {
"task": item,
"check_last_success": True,
"max_success_age_hours": cfg.get("max_success_age_hours", 48),
}
if isinstance(item, dict):
normalized = dict(item)
normalized.setdefault("check_last_success", True)
normalized.setdefault("max_success_age_hours", cfg.get("max_success_age_hours", 48))
return normalized
raise TypeError(f"Unsupported Ariadne schedule task config entry: {item!r}")
def _tracked_tasks(tasks: list[dict]) -> list[dict]:
tracked = [item for item in tasks if item.get("check_last_success")]
assert tracked, "No Ariadne schedule tasks are marked for success tracking"
return tracked
def _task_regex(tasks: list[dict]) -> str:
return "|".join(item["task"] for item in tasks)
def test_ariadne_schedule_series_exist():
    """Every configured task must export a next-run timestamp metric."""
    tasks = _expected_tasks()
    selector = _task_regex(tasks)
    results = _query(f'ariadne_schedule_next_run_timestamp_seconds{{task=~"{selector}"}}')
    reported = {entry.get("metric", {}).get("task") for entry in results}
    missing = [task["task"] for task in tasks if task["task"] not in reported]
    assert not missing, f"Missing next-run metrics for: {', '.join(missing)}"
def test_ariadne_schedule_recent_success():
    """Tracked tasks must export a last-success metric that is fresh enough."""
    tasks = _tracked_tasks(_expected_tasks())
    selector = _task_regex(tasks)
    series = _query(f'ariadne_schedule_last_success_timestamp_seconds{{task=~"{selector}"}}')
    seen = {entry.get("metric", {}).get("task") for entry in series}
    missing = [task["task"] for task in tasks if task["task"] not in seen]
    assert not missing, f"Missing last-success metrics for: {', '.join(missing)}"
    now = datetime.now(timezone.utc)
    # Age (in hours) of the most recent success, keyed by task name.
    age_by_task = {}
    for entry in series:
        last_success = datetime.fromtimestamp(float(entry["value"][1]), tz=timezone.utc)
        age_by_task[entry.get("metric", {}).get("task")] = (now - last_success).total_seconds() / 3600
    too_old = []
    for item in tasks:
        task = item["task"]
        if task in age_by_task and age_by_task[task] > float(item["max_success_age_hours"]):
            too_old.append(f"{task} ({age_by_task[task]:.1f}h > {item['max_success_age_hours']}h)")
    assert not too_old, "Ariadne schedules are stale: " + ", ".join(too_old)
def test_ariadne_schedule_last_status_present_and_boolean():
    """Tracked tasks must export last-status metrics whose values are strictly 0 or 1."""
    tasks = _tracked_tasks(_expected_tasks())
    selector = _task_regex(tasks)
    series = _query(f'ariadne_schedule_last_status{{task=~"{selector}"}}')
    reported = {entry.get("metric", {}).get("task") for entry in series}
    missing = [task["task"] for task in tasks if task["task"] not in reported]
    assert not missing, f"Missing last-status metrics for: {', '.join(missing)}"
    invalid = []
    for entry in series:
        task = entry.get("metric", {}).get("task")
        value = float(entry["value"][1])
        if value not in (0.0, 1.0):
            invalid.append(f"{task}={value}")
    assert not invalid, f"Unexpected Ariadne last-status values: {', '.join(invalid)}"

View File

@ -1,5 +1,3 @@
"""Glue checks for the metrics the quality-gate publishes."""
from __future__ import annotations
import os
@ -25,63 +23,26 @@ def _query(promql: str) -> list[dict]:
return payload.get("data", {}).get("result", [])
def _expected_tasks() -> list[dict]:
cfg = _load_config()
tasks = [
_normalize_task(item, cfg)
for item in cfg.get("ariadne_schedule_tasks", [])
]
assert tasks, "No Ariadne schedule tasks configured"
return tasks
def test_glue_metrics_present():
series = _query('kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}')
assert series, "No glue cronjob label series found"
def _normalize_task(item: object, cfg: dict) -> dict:
if isinstance(item, str):
return {
"task": item,
"check_last_success": True,
"max_success_age_hours": cfg.get("max_success_age_hours", 48),
}
if isinstance(item, dict):
normalized = dict(item)
normalized.setdefault("check_last_success", True)
normalized.setdefault("max_success_age_hours", cfg.get("max_success_age_hours", 48))
return normalized
raise TypeError(f"Unsupported Ariadne schedule task config entry: {item!r}")
def _tracked_tasks(tasks: list[dict]) -> list[dict]:
tracked = [item for item in tasks if item.get("check_last_success")]
assert tracked, "No Ariadne schedule tasks are marked for success tracking"
return tracked
def _task_regex(tasks: list[dict]) -> str:
return "|".join(item["task"] for item in tasks)
def test_glue_metrics_success_join():
query = (
"kube_cronjob_status_last_successful_time "
'and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'
)
series = _query(query)
assert series, "No glue cronjob last success series found"
def test_ariadne_schedule_metrics_present():
tasks = _expected_tasks()
selector = _task_regex(tasks)
series = _query(f'ariadne_schedule_next_run_timestamp_seconds{{task=~"{selector}"}}')
seen = {item.get("metric", {}).get("task") for item in series}
missing = [item["task"] for item in tasks if item["task"] not in seen]
cfg = _load_config()
expected = cfg.get("ariadne_schedule_tasks", [])
if not expected:
return
series = _query("ariadne_schedule_next_run_timestamp_seconds")
tasks = {item.get("metric", {}).get("task") for item in series}
missing = [task for task in expected if task not in tasks]
assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"
def test_ariadne_schedule_success_and_status_metrics_present():
tasks = _tracked_tasks(_expected_tasks())
selector = _task_regex(tasks)
success = _query(f'ariadne_schedule_last_success_timestamp_seconds{{task=~"{selector}"}}')
status = _query(f'ariadne_schedule_last_status{{task=~"{selector}"}}')
success_tasks = {item.get("metric", {}).get("task") for item in success}
status_tasks = {item.get("metric", {}).get("task") for item in status}
expected = {item["task"] for item in tasks}
missing_success = sorted(expected - success_tasks)
missing_status = sorted(expected - status_tasks)
assert not missing_success, f"Missing Ariadne success metrics for: {', '.join(missing_success)}"
assert not missing_status, f"Missing Ariadne status metrics for: {', '.join(missing_status)}"

View File

@ -1,401 +0,0 @@
{
"version": 1,
"generated_from": "Jenkins titan-iac build 225 Trivy filesystem scan",
"default_expires_at": "2026-05-22",
"ticket": "atlas-quality-wave-k8s-hardening",
"default_reason": "Existing Kubernetes manifest hardening baseline accepted only for the first quality-gate rollout; fix or renew explicitly before expiry.",
"misconfigurations": [
{
"id": "DS-0002",
"targets": [
"dockerfiles/Dockerfile.ananke-node-helper"
]
},
{
"id": "KSV-0009",
"targets": [
"services/mailu/vip-controller.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml"
]
},
{
"id": "KSV-0010",
"targets": [
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml"
]
},
{
"id": "KSV-0014",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml",
"infrastructure/core/ntp-sync-daemonset.yaml",
"infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml",
"infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml",
"infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml",
"infrastructure/longhorn/core/vault-sync-deployment.yaml",
"infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml",
"infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml",
"infrastructure/postgres/statefulset.yaml",
"infrastructure/vault-csi/vault-csi-provider.yaml",
"services/ai-llm/deployment.yaml",
"services/bstein-dev-home/backend-deployment.yaml",
"services/bstein-dev-home/chat-ai-gateway-deployment.yaml",
"services/bstein-dev-home/frontend-deployment.yaml",
"services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml",
"services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml",
"services/bstein-dev-home/vault-sync-deployment.yaml",
"services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml",
"services/comms/atlasbot-deployment.yaml",
"services/comms/coturn.yaml",
"services/comms/element-call-deployment.yaml",
"services/comms/guest-name-job.yaml",
"services/comms/guest-register-deployment.yaml",
"services/comms/livekit-token-deployment.yaml",
"services/comms/livekit.yaml",
"services/comms/mas-deployment.yaml",
"services/comms/oneoffs/bstein-force-leave-job.yaml",
"services/comms/oneoffs/comms-secrets-ensure-job.yaml",
"services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml",
"services/comms/oneoffs/mas-db-ensure-job.yaml",
"services/comms/oneoffs/mas-local-users-ensure-job.yaml",
"services/comms/oneoffs/othrys-kick-numeric-job.yaml",
"services/comms/oneoffs/synapse-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-signingkey-ensure-job.yaml",
"services/comms/oneoffs/synapse-user-seed-job.yaml",
"services/comms/pin-othrys-job.yaml",
"services/comms/reset-othrys-room-job.yaml",
"services/comms/seed-othrys-room.yaml",
"services/comms/vault-sync-deployment.yaml",
"services/comms/wellknown.yaml",
"services/crypto/monerod/deployment.yaml",
"services/crypto/wallet-monero-temp/deployment.yaml",
"services/crypto/xmr-miner/deployment.yaml",
"services/crypto/xmr-miner/vault-sync-deployment.yaml",
"services/crypto/xmr-miner/xmrig-daemonset.yaml",
"services/finance/actual-budget-deployment.yaml",
"services/finance/firefly-cronjob.yaml",
"services/finance/firefly-deployment.yaml",
"services/finance/firefly-user-sync-cronjob.yaml",
"services/finance/oneoffs/finance-secrets-ensure-job.yaml",
"services/gitea/deployment.yaml",
"services/harbor/vault-sync-deployment.yaml",
"services/health/wger-admin-ensure-cronjob.yaml",
"services/health/wger-deployment.yaml",
"services/health/wger-user-sync-cronjob.yaml",
"services/jellyfin/deployment.yaml",
"services/jellyfin/loader.yaml",
"services/jenkins/deployment.yaml",
"services/jenkins/vault-sync-deployment.yaml",
"services/keycloak/deployment.yaml",
"services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/ldap-federation-job.yaml",
"services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/mas-secrets-ensure-job.yaml",
"services/keycloak/oneoffs/metis-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/metis-ssh-keys-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-e2e-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml",
"services/keycloak/oneoffs/portal-e2e-target-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml",
"services/keycloak/oneoffs/quality-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/realm-settings-job.yaml",
"services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/user-overrides-job.yaml",
"services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml",
"services/keycloak/vault-sync-deployment.yaml",
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/logging/oauth2-proxy.yaml",
"services/logging/oneoffs/opensearch-dashboards-setup-job.yaml",
"services/logging/oneoffs/opensearch-ism-job.yaml",
"services/logging/oneoffs/opensearch-observability-setup-job.yaml",
"services/logging/opensearch-prune-cronjob.yaml",
"services/logging/vault-sync-deployment.yaml",
"services/mailu/mailu-sync-cronjob.yaml",
"services/mailu/mailu-sync-listener.yaml",
"services/mailu/oneoffs/mailu-sync-job.yaml",
"services/mailu/vault-sync-deployment.yaml",
"services/mailu/vip-controller.yaml",
"services/maintenance/ariadne-deployment.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/metis-k3s-token-sync-cronjob.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oauth2-proxy-metis.yaml",
"services/maintenance/oauth2-proxy-soteria.yaml",
"services/maintenance/oneoffs/ariadne-migrate-job.yaml",
"services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml",
"services/maintenance/pod-cleaner-cronjob.yaml",
"services/maintenance/soteria-deployment.yaml",
"services/maintenance/vault-sync-deployment.yaml",
"services/monitoring/dcgm-exporter.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml",
"services/monitoring/oneoffs/grafana-org-bootstrap.yaml",
"services/monitoring/oneoffs/grafana-user-dedupe-job.yaml",
"services/monitoring/platform-quality-gateway-deployment.yaml",
"services/monitoring/platform-quality-suite-probe-cronjob.yaml",
"services/monitoring/postmark-exporter-deployment.yaml",
"services/monitoring/vault-sync-deployment.yaml",
"services/nextcloud-mail-sync/cronjob.yaml",
"services/nextcloud/collabora.yaml",
"services/nextcloud/cronjob.yaml",
"services/nextcloud/deployment.yaml",
"services/nextcloud/maintenance-cronjob.yaml",
"services/oauth2-proxy/deployment.yaml",
"services/openldap/statefulset.yaml",
"services/outline/deployment.yaml",
"services/outline/redis-deployment.yaml",
"services/pegasus/deployment.yaml",
"services/pegasus/vault-sync-deployment.yaml",
"services/planka/deployment.yaml",
"services/quality/oauth2-proxy-sonarqube.yaml",
"services/quality/sonarqube-deployment.yaml",
"services/quality/sonarqube-exporter-deployment.yaml",
"services/sui-metrics/base/deployment.yaml",
"services/typhon/vault-sync-deployment.yaml",
"services/vault/k8s-auth-config-cronjob.yaml",
"services/vault/oidc-config-cronjob.yaml",
"services/vault/statefulset.yaml",
"services/vaultwarden/deployment.yaml"
]
},
{
"id": "KSV-0017",
"targets": [
"infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml",
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml",
"services/monitoring/dcgm-exporter.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml"
]
},
{
"id": "KSV-0041",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml",
"infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml",
"infrastructure/traefik/clusterrole.yaml",
"services/bstein-dev-home/rbac.yaml",
"services/comms/comms-secrets-ensure-rbac.yaml",
"services/comms/mas-db-ensure-rbac.yaml",
"services/comms/mas-secrets-ensure-rbac.yaml",
"services/maintenance/soteria-rbac.yaml"
]
},
{
"id": "KSV-0047",
"targets": [
"services/monitoring/rbac.yaml"
]
},
{
"id": "KSV-0053",
"targets": [
"services/comms/comms-secrets-ensure-rbac.yaml",
"services/comms/mas-db-ensure-rbac.yaml",
"services/jenkins/serviceaccount.yaml",
"services/maintenance/ariadne-rbac.yaml"
]
},
{
"id": "KSV-0056",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml",
"infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml",
"services/jenkins/serviceaccount.yaml",
"services/maintenance/disable-k3s-traefik-rbac.yaml",
"services/maintenance/k3s-traefik-cleanup-rbac.yaml"
]
},
{
"id": "KSV-0114",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml"
]
},
{
"id": "KSV-0118",
"targets": [
"infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml",
"infrastructure/core/coredns-deployment.yaml",
"infrastructure/core/ntp-sync-daemonset.yaml",
"infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml",
"infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml",
"infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml",
"infrastructure/longhorn/core/vault-sync-deployment.yaml",
"infrastructure/longhorn/ui-ingress/oauth2-proxy-longhorn.yaml",
"infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml",
"infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml",
"infrastructure/postgres/statefulset.yaml",
"infrastructure/vault-csi/vault-csi-provider.yaml",
"services/ai-llm/deployment.yaml",
"services/bstein-dev-home/backend-deployment.yaml",
"services/bstein-dev-home/chat-ai-gateway-deployment.yaml",
"services/bstein-dev-home/frontend-deployment.yaml",
"services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml",
"services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml",
"services/bstein-dev-home/vault-sync-deployment.yaml",
"services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml",
"services/comms/atlasbot-deployment.yaml",
"services/comms/coturn.yaml",
"services/comms/element-call-deployment.yaml",
"services/comms/guest-name-job.yaml",
"services/comms/livekit-token-deployment.yaml",
"services/comms/livekit.yaml",
"services/comms/mas-deployment.yaml",
"services/comms/oneoffs/bstein-force-leave-job.yaml",
"services/comms/oneoffs/comms-secrets-ensure-job.yaml",
"services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml",
"services/comms/oneoffs/mas-db-ensure-job.yaml",
"services/comms/oneoffs/mas-local-users-ensure-job.yaml",
"services/comms/oneoffs/othrys-kick-numeric-job.yaml",
"services/comms/oneoffs/synapse-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml",
"services/comms/oneoffs/synapse-signingkey-ensure-job.yaml",
"services/comms/oneoffs/synapse-user-seed-job.yaml",
"services/comms/pin-othrys-job.yaml",
"services/comms/reset-othrys-room-job.yaml",
"services/comms/seed-othrys-room.yaml",
"services/comms/vault-sync-deployment.yaml",
"services/comms/wellknown.yaml",
"services/crypto/monerod/deployment.yaml",
"services/crypto/wallet-monero-temp/deployment.yaml",
"services/crypto/xmr-miner/deployment.yaml",
"services/crypto/xmr-miner/vault-sync-deployment.yaml",
"services/crypto/xmr-miner/xmrig-daemonset.yaml",
"services/finance/firefly-cronjob.yaml",
"services/finance/firefly-deployment.yaml",
"services/finance/firefly-user-sync-cronjob.yaml",
"services/finance/oneoffs/finance-secrets-ensure-job.yaml",
"services/gitea/deployment.yaml",
"services/harbor/vault-sync-deployment.yaml",
"services/health/wger-admin-ensure-cronjob.yaml",
"services/health/wger-deployment.yaml",
"services/health/wger-user-sync-cronjob.yaml",
"services/jellyfin/loader.yaml",
"services/jenkins/deployment.yaml",
"services/jenkins/vault-sync-deployment.yaml",
"services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/ldap-federation-job.yaml",
"services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/mas-secrets-ensure-job.yaml",
"services/keycloak/oneoffs/metis-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/metis-ssh-keys-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml",
"services/keycloak/oneoffs/portal-e2e-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml",
"services/keycloak/oneoffs/portal-e2e-target-client-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml",
"services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml",
"services/keycloak/oneoffs/quality-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/realm-settings-job.yaml",
"services/keycloak/oneoffs/soteria-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml",
"services/keycloak/oneoffs/user-overrides-job.yaml",
"services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml",
"services/keycloak/vault-sync-deployment.yaml",
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/logging/oauth2-proxy.yaml",
"services/logging/oneoffs/opensearch-dashboards-setup-job.yaml",
"services/logging/oneoffs/opensearch-ism-job.yaml",
"services/logging/oneoffs/opensearch-observability-setup-job.yaml",
"services/logging/opensearch-prune-cronjob.yaml",
"services/logging/vault-sync-deployment.yaml",
"services/mailu/mailu-sync-cronjob.yaml",
"services/mailu/mailu-sync-listener.yaml",
"services/mailu/oneoffs/mailu-sync-job.yaml",
"services/mailu/vault-sync-deployment.yaml",
"services/mailu/vip-controller.yaml",
"services/maintenance/ariadne-deployment.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/k3s-agent-restart-daemonset.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/metis-k3s-token-sync-cronjob.yaml",
"services/maintenance/metis-sentinel-amd64-daemonset.yaml",
"services/maintenance/metis-sentinel-arm64-daemonset.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oauth2-proxy-metis.yaml",
"services/maintenance/oauth2-proxy-soteria.yaml",
"services/maintenance/oneoffs/ariadne-migrate-job.yaml",
"services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml",
"services/maintenance/pod-cleaner-cronjob.yaml",
"services/maintenance/soteria-deployment.yaml",
"services/maintenance/vault-sync-deployment.yaml",
"services/monitoring/dcgm-exporter.yaml",
"services/monitoring/jetson-tegrastats-exporter.yaml",
"services/monitoring/oneoffs/grafana-org-bootstrap.yaml",
"services/monitoring/oneoffs/grafana-user-dedupe-job.yaml",
"services/monitoring/platform-quality-gateway-deployment.yaml",
"services/monitoring/platform-quality-suite-probe-cronjob.yaml",
"services/monitoring/postmark-exporter-deployment.yaml",
"services/monitoring/vault-sync-deployment.yaml",
"services/nextcloud/collabora.yaml",
"services/oauth2-proxy/deployment.yaml",
"services/openldap/statefulset.yaml",
"services/outline/deployment.yaml",
"services/outline/redis-deployment.yaml",
"services/pegasus/vault-sync-deployment.yaml",
"services/quality/oauth2-proxy-sonarqube.yaml",
"services/quality/sonarqube-deployment.yaml",
"services/quality/sonarqube-exporter-deployment.yaml",
"services/sui-metrics/base/deployment.yaml",
"services/sui-metrics/overlays/atlas/patch-node-selector.yaml",
"services/typhon/deployment.yaml",
"services/typhon/vault-sync-deployment.yaml",
"services/vault/k8s-auth-config-cronjob.yaml",
"services/vault/oidc-config-cronjob.yaml",
"services/vaultwarden/deployment.yaml"
]
},
{
"id": "KSV-0121",
"targets": [
"services/logging/node-image-gc-rpi4-daemonset.yaml",
"services/logging/node-image-prune-rpi5-daemonset.yaml",
"services/logging/node-log-rotation-daemonset.yaml",
"services/maintenance/disable-k3s-traefik-daemonset.yaml",
"services/maintenance/image-sweeper-cronjob.yaml",
"services/maintenance/metis-deployment.yaml",
"services/maintenance/node-image-sweeper-daemonset.yaml",
"services/maintenance/node-nofile-daemonset.yaml",
"services/maintenance/oneoffs/titan-24-rootfs-sweep-job.yaml"
]
}
]
}

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: main
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(bstein-dev-home): automated image update"
push:
branch: main
branch: feature/ariadne
update:
strategy: Setters
path: services/bstein-dev-home

View File

@ -13,8 +13,4 @@ spec:
kind: GitRepository
name: flux-system
namespace: flux-system
dependsOn:
- name: longhorn
- name: vault
- name: postgres
wait: true

View File

@ -16,6 +16,3 @@ spec:
wait: false
dependsOn:
- name: core
- name: longhorn
- name: vault
- name: postgres

View File

@ -25,4 +25,3 @@ spec:
name: jenkins
namespace: jenkins
wait: false
timeout: 20m

View File

@ -12,8 +12,4 @@ spec:
name: flux-system
path: ./services/keycloak
targetNamespace: sso
dependsOn:
- name: longhorn
- name: vault
- name: postgres
timeout: 2m

View File

@ -21,12 +21,10 @@ resources:
- sui-metrics/kustomization.yaml
- openldap/kustomization.yaml
- keycloak/kustomization.yaml
- quality/kustomization.yaml
- oauth2-proxy/kustomization.yaml
- mailu/kustomization.yaml
- jenkins/kustomization.yaml
- ai-llm/kustomization.yaml
- typhon/kustomization.yaml
- nextcloud/kustomization.yaml
- nextcloud-mail-sync/kustomization.yaml
- outline/kustomization.yaml

View File

@ -16,4 +16,4 @@ spec:
dependsOn:
- name: crypto
wait: true
timeout: 15m
timeout: 5m

View File

@ -1,35 +0,0 @@
# clusters/atlas/flux-system/applications/quality/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: quality
namespace: flux-system
spec:
interval: 10m
path: ./services/quality
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: quality
dependsOn:
- name: traefik
- name: cert-manager
- name: keycloak
- name: vault
- name: postgres
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: sonarqube
namespace: quality
- apiVersion: apps/v1
kind: Deployment
name: sonarqube-exporter
namespace: quality
- apiVersion: apps/v1
kind: Deployment
name: oauth2-proxy-sonarqube
namespace: quality
wait: false
timeout: 20m

View File

@ -1,29 +0,0 @@
# clusters/atlas/flux-system/applications/typhon/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: typhon
namespace: flux-system
spec:
interval: 10m
path: ./services/typhon
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: climate
dependsOn:
- name: vault
- name: vault-csi
- name: monitoring
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: typhon
namespace: climate
- apiVersion: v1
kind: Service
name: typhon
namespace: climate
wait: false
timeout: 20m

View File

@ -15,5 +15,4 @@ spec:
prune: true
wait: true
dependsOn:
- name: longhorn
- name: helm

View File

@ -17,4 +17,3 @@ spec:
- name: crypto
- name: monerod
wait: true
timeout: 30m

View File

@ -9,7 +9,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: main
branch: feature/ariadne
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: main
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(maintenance): automated image update"
push:
branch: main
branch: feature/ariadne
update:
strategy: Setters
path: services/maintenance

View File

@ -14,7 +14,6 @@ spec:
name: flux-system
targetNamespace: postgres
dependsOn:
- name: longhorn
- name: vault
- name: vault-csi
healthChecks:

View File

@ -1,12 +0,0 @@
FROM debian:bookworm-slim
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
bash \
ca-certificates \
curl \
util-linux \
zstd \
&& rm -rf /var/lib/apt/lists/*
CMD ["/bin/sh"]

View File

@ -2,8 +2,4 @@ FROM python:3.11-slim
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
RUN pip install --no-cache-dir requests psycopg2-binary \
&& groupadd --system guest-tools \
&& useradd --system --uid 65532 --gid guest-tools --home-dir /nonexistent --shell /usr/sbin/nologin guest-tools
USER guest-tools
RUN pip install --no-cache-dir requests psycopg2-binary

View File

@ -1,8 +1,16 @@
# Use the mirrored Harbor artifact so CI does not depend on Docker Hub egress.
FROM registry.bstein.dev/streaming/data-prepper@sha256:32ac6ad42e0f12da08bebee307e290b17d127b30def9b06eeaffbcbbc5033e83
FROM --platform=$BUILDPLATFORM opensearchproject/data-prepper:2.8.0 AS source
FROM --platform=$TARGETPLATFORM eclipse-temurin:17-jre
ENV DATA_PREPPER_PATH=/usr/share/data-prepper
RUN useradd -u 10001 -M -U -d / -s /usr/sbin/nologin data_prepper \
&& mkdir -p /var/log/data-prepper
COPY --from=source /usr/share/data-prepper /usr/share/data-prepper
RUN chown -R 10001:10001 /usr/share/data-prepper /var/log/data-prepper
USER 10001
WORKDIR /usr/share/data-prepper
CMD ["bin/data-prepper"]

View File

@ -1,13 +1,10 @@
FROM ghcr.io/element-hq/lk-jwt-service:0.3.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates \
&& addgroup -S livekit-token \
&& adduser -S -D -H -u 65532 -G livekit-token livekit-token
RUN apk add --no-cache ca-certificates
COPY --from=base /lk-jwt-service /lk-jwt-service
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER livekit-token
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/lk-jwt-service"]

View File

@ -29,12 +29,10 @@ FROM ${DEBIAN_IMAGE}
RUN set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends ca-certificates; \
update-ca-certificates; rm -rf /var/lib/apt/lists/*; \
groupadd --system p2pool; \
useradd --system --uid 65532 --gid p2pool --home-dir /nonexistent --shell /usr/sbin/nologin p2pool
update-ca-certificates; rm -rf /var/lib/apt/lists/*
COPY --from=fetch /out/p2pool /usr/local/bin/p2pool
RUN /usr/local/bin/p2pool --version || true
EXPOSE 3333
USER p2pool
ENTRYPOINT ["/usr/local/bin/p2pool"]

View File

@ -26,12 +26,9 @@ RUN set -eux; \
curl -fsSL "$URL" -o /opt/monero/monero.tar.bz2; \
tar -xjf /opt/monero/monero.tar.bz2 -C /opt/monero --strip-components=1; \
install -m 0755 /opt/monero/monero-wallet-rpc /usr/local/bin/monero-wallet-rpc; \
rm -f /opt/monero/monero.tar.bz2; \
groupadd --system monero; \
useradd --system --uid 1000 --gid monero --home-dir /nonexistent --shell /usr/sbin/nologin monero
rm -f /opt/monero/monero.tar.bz2
ENV PATH="/usr/local/bin:/usr/bin:/bin"
RUN /usr/local/bin/monero-wallet-rpc --version || true
EXPOSE 18083
USER monero

View File

@ -23,14 +23,10 @@ RUN set -eux; \
mkdir -p /opt/monero; \
tar -xjf /tmp/monero.tar.bz2 -C /opt/monero --strip-components=1; \
rm -f /tmp/monero.tar.bz2; \
groupadd --system monero; \
useradd --system --uid 1000 --gid monero --home-dir /nonexistent --shell /usr/sbin/nologin monero; \
mkdir -p /data; \
chown monero:monero /data; \
chmod 0770 /data
ENV LD_LIBRARY_PATH=/opt/monero:/opt/monero/lib \
PATH="/opt/monero:${PATH}"
USER monero
CMD ["/opt/monero/monerod", "--version"]

View File

@ -1,13 +1,10 @@
FROM quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates \
&& addgroup -S oauth2-proxy \
&& adduser -S -D -H -u 65532 -G oauth2-proxy oauth2-proxy
RUN apk add --no-cache ca-certificates
COPY --from=base /bin/oauth2-proxy /bin/oauth2-proxy
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER oauth2-proxy
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/bin/oauth2-proxy"]

View File

@ -1,13 +1,10 @@
FROM registry.bstein.dev/streaming/pegasus:1.2.32 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates \
&& addgroup -S pegasus \
&& adduser -S -D -H -u 65532 -G pegasus pegasus
RUN apk add --no-cache ca-certificates
COPY --from=base /pegasus /pegasus
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER pegasus
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/pegasus"]

View File

@ -1,48 +0,0 @@
# dockerfiles/Dockerfile.quality-tools
FROM debian:bookworm-slim
ARG SONAR_SCANNER_VERSION=8.0.1.6346
ARG TRIVY_VERSION=0.70.0
ENV TRIVY_CACHE_DIR=/opt/trivy-cache
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
bash \
ca-certificates \
curl \
git \
jq \
unzip \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd --system quality-tools \
&& useradd --system --uid 65532 --gid quality-tools --home-dir /nonexistent --shell /usr/sbin/nologin quality-tools
RUN set -eux; \
scanner_zip="sonar-scanner-cli-${SONAR_SCANNER_VERSION}-linux-aarch64.zip"; \
base_url="https://binaries.sonarsource.com/Distribution/sonar-scanner-cli"; \
curl -fsSL "${base_url}/${scanner_zip}" -o "/tmp/${scanner_zip}"; \
curl -fsSL "${base_url}/${scanner_zip}.sha256" -o "/tmp/${scanner_zip}.sha256"; \
printf '%s %s\n' "$(cat "/tmp/${scanner_zip}.sha256")" "/tmp/${scanner_zip}" | sha256sum -c -; \
unzip -q "/tmp/${scanner_zip}" -d /opt; \
ln -s "/opt/sonar-scanner-${SONAR_SCANNER_VERSION}-linux-aarch64/bin/sonar-scanner" /usr/local/bin/sonar-scanner; \
rm -f "/tmp/${scanner_zip}" "/tmp/${scanner_zip}.sha256"
RUN set -eux; \
trivy_tgz="trivy_${TRIVY_VERSION}_Linux-ARM64.tar.gz"; \
curl -fsSL "https://github.com/aquasecurity/trivy/releases/download/v${TRIVY_VERSION}/${trivy_tgz}" -o "/tmp/${trivy_tgz}"; \
tar -C /usr/local/bin -xzf "/tmp/${trivy_tgz}" trivy; \
rm -f "/tmp/${trivy_tgz}"; \
trivy --version; \
sonar-scanner -v
RUN set -eux; \
mkdir -p "${TRIVY_CACHE_DIR}"; \
trivy image --download-db-only --cache-dir "${TRIVY_CACHE_DIR}"; \
chmod -R a+rX "${TRIVY_CACHE_DIR}"; \
mkdir -p /workspace; \
chown quality-tools:quality-tools /workspace
WORKDIR /workspace
USER quality-tools

View File

@ -27,42 +27,10 @@ spec:
timeout: 10m
values:
installCRDs: true
extraArgs:
- --acme-http01-solver-nameservers=1.1.1.1:53,8.8.8.8:53
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
@ -76,36 +44,6 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
@ -119,36 +57,6 @@ spec:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:

View File

@ -26,7 +26,7 @@ spec:
spec:
containers:
- name: coredns
image: registry.k8s.io/coredns/coredns:v1.12.1
image: registry.bstein.dev/infra/coredns:1.12.1
imagePullPolicy: IfNotPresent
args:
- -conf

View File

@ -26,9 +26,6 @@ spec:
cleanupOnFail: true
timeout: 15m
values:
global:
nodeSelector:
longhorn-host: "true"
service:
ui:
type: NodePort
@ -81,12 +78,3 @@ spec:
tag: v2.16.0
defaultSettings:
systemManagedPodsImagePullPolicy: Always
longhornManager:
nodeSelector:
longhorn-host: "true"
longhornDriver:
nodeSelector:
longhorn-host: "true"
longhornUI:
nodeSelector:
longhorn-host: "true"

View File

@ -8,15 +8,11 @@ resources:
- vault-sync-deployment.yaml
- helmrelease.yaml
- longhorn-settings-ensure-job.yaml
- longhorn-disk-tags-ensure-job.yaml
configMapGenerator:
- name: longhorn-settings-ensure-script
files:
- longhorn_settings_ensure.sh=scripts/longhorn_settings_ensure.sh
- name: longhorn-disk-tags-ensure-script
files:
- longhorn_disk_tags_ensure.py=scripts/longhorn_disk_tags_ensure.py
generatorOptions:
disableNameSuffixHash: true

View File

@ -1,36 +0,0 @@
# infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-disk-tags-ensure-1
namespace: longhorn-system
spec:
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
spec:
serviceAccountName: longhorn-service-account
restartPolicy: Never
volumes:
- name: longhorn-disk-tags-ensure-script
configMap:
name: longhorn-disk-tags-ensure-script
defaultMode: 0555
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
- key: node-role.kubernetes.io/worker
operator: Exists
containers:
- name: apply
image: python:3.12.9-alpine3.20
command: ["python", "/scripts/longhorn_disk_tags_ensure.py"]
volumeMounts:
- name: longhorn-disk-tags-ensure-script
mountPath: /scripts
readOnly: true

View File

@ -2,11 +2,10 @@
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-settings-ensure-7
name: longhorn-settings-ensure-4
namespace: longhorn-system
spec:
backoffLimit: 0
activeDeadlineSeconds: 240
ttlSecondsAfterFinished: 3600
template:
spec:

View File

@ -1,100 +0,0 @@
#!/usr/bin/env python3
"""Reconcile Longhorn disk tags for the Titan longhorn storage classes.
The astreae/asteria storageclasses select Longhorn disks by tag. The current
nodes already have the right disk paths, but the tag fields can drift to empty
after node recovery. This job patches the live Longhorn Node CRs back to the
expected tags so PVC provisioning keeps working.
"""
from __future__ import annotations
import json
import os
import ssl
import urllib.request
# Namespace holding the Longhorn Node CRs.
LONGHORN_NS = "longhorn-system"
# Longhorn v1beta2 Node CR collection path; format with namespace=...
LONGHORN_API = "/apis/longhorn.io/v1beta2/namespaces/{namespace}/nodes"
# Disk mount path -> the single tag that disk must carry for the
# astreae/asteria storageclass selectors to match.
DESIRED_TAGS = {
    "/mnt/astreae": "astreae",
    "/mnt/asteria": "asteria",
}
def api_base() -> str:
    """Build the in-cluster Kubernetes API base URL from the service env vars.

    Returns:
        ``https://<host>:<port>`` using KUBERNETES_SERVICE_HOST and
        KUBERNETES_SERVICE_PORT (port defaults to ``443``).

    Raises:
        SystemExit: when KUBERNETES_SERVICE_HOST is unset or empty.
    """
    api_host = os.environ.get("KUBERNETES_SERVICE_HOST")
    if not api_host:
        raise SystemExit("missing KUBERNETES_SERVICE_HOST")
    api_port = os.environ.get("KUBERNETES_SERVICE_PORT", "443")
    return "https://" + api_host + ":" + api_port
def token() -> str:
    """Read and return the pod's service-account bearer token (stripped)."""
    token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
    with open(token_path, encoding="utf-8") as handle:
        return handle.read().strip()
def ca_context() -> ssl.SSLContext:
    """Return a TLS context trusting the cluster CA from the SA mount."""
    return ssl.create_default_context(
        cafile="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
    )
def request_json(method: str, path: str, body: dict | None = None) -> dict:
    """Issue an authenticated Kubernetes API request and decode the JSON reply.

    Args:
        method: HTTP verb ("GET", "PATCH", ...).
        path: API path appended to the in-cluster base URL.
        body: optional payload, JSON-encoded when present.

    Returns:
        Decoded JSON object, or {} when the response body is empty.

    The merge-patch Content-Type is sent on every request (PATCH needs it;
    GETs carry it too).
    """
    headers = {
        "Authorization": f"Bearer {token()}",
        "Content-Type": "application/merge-patch+json",
        "Accept": "application/json",
    }
    data = None
    if body is not None:
        data = json.dumps(body).encode("utf-8")
    req = urllib.request.Request(
        f"{api_base()}{path}",
        method=method,
        headers=headers,
        data=data,
    )
    with urllib.request.urlopen(req, context=ca_context(), timeout=20) as resp:
        raw = resp.read()
        return json.loads(raw) if raw else {}
def list_nodes() -> list[dict]:
    """Fetch every Longhorn Node CR in the Longhorn namespace."""
    response = request_json("GET", LONGHORN_API.format(namespace=LONGHORN_NS))
    return response.get("items", [])
def patch_disk_tags(node_name: str, disk_name: str, desired_tag: str) -> None:
    """Merge-patch one disk on a Longhorn Node CR to carry exactly one tag."""
    node_path = f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}"
    patch = {"spec": {"disks": {disk_name: {"tags": [desired_tag]}}}}
    request_json("PATCH", node_path, body=patch)
def main() -> int:
    """Walk every Longhorn node and pin managed disks to their desired tag.

    Prints one line per patched disk plus a summary line; always returns 0
    so the Job pod exits successfully once reconciliation completes.
    """
    changed = 0
    skipped = 0
    for node in list_nodes():
        name = node.get("metadata", {}).get("name", "")
        disks = node.get("spec", {}).get("disks", {}) or {}
        for disk_name, disk in disks.items():
            disk_path = disk.get("path")
            desired_tag = DESIRED_TAGS.get(disk_path)
            if not desired_tag:
                # Not one of the managed astreae/asteria mount paths.
                continue
            current_tags = disk.get("tags") or []
            if current_tags == [desired_tag]:
                # Already in the desired state; nothing to patch.
                skipped += 1
                continue
            print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {[desired_tag]!r}")
            patch_disk_tags(name, disk_name, desired_tag)
            changed += 1
    print(f"done: changed={changed} skipped={skipped}")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@ -4,12 +4,11 @@ set -eu
# Longhorn blocks direct CR patches for some settings; use the internal API instead.
api_base="http://longhorn-backend.longhorn-system.svc:9500/v1/settings"
curl_opts="-fsS --connect-timeout 3 --max-time 15"
wait_for_api() {
attempts=30
while [ "${attempts}" -gt 0 ]; do
if curl ${curl_opts} "${api_base}" >/dev/null 2>&1; then
if curl -fsS "${api_base}" >/dev/null 2>&1; then
return 0
fi
attempts=$((attempts - 1))
@ -23,14 +22,14 @@ update_setting() {
name="$1"
value="$2"
current="$(curl ${curl_opts} "${api_base}/${name}" || true)"
current="$(curl -fsS "${api_base}/${name}" || true)"
if echo "${current}" | grep -Fq "\"value\":\"${value}\""; then
echo "Setting ${name} already set."
return 0
fi
echo "Setting ${name} -> ${value}"
curl ${curl_opts} -X PUT \
curl -fsS -X PUT \
-H "Content-Type: application/json" \
-d "{\"value\":\"${value}\"}" \
"${api_base}/${name}" >/dev/null
@ -41,7 +40,3 @@ update_setting default-engine-image "registry.bstein.dev/infra/longhorn-engine:v
update_setting default-instance-manager-image "registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2"
update_setting default-backing-image-manager-image "registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2"
update_setting support-bundle-manager-image "registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56"
# Keep storage-heavy nodes from getting hammered by rebuild storms and skew.
update_setting replica-auto-balance "best-effort"
update_setting concurrent-replica-rebuild-per-node-limit "2"
update_setting node-down-pod-deletion-policy "delete-both-statefulset-and-deployment-pod"

View File

@ -13,27 +13,9 @@ spec:
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
- objectName: "longhorn-backup-b2__AWS_ACCESS_KEY_ID"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ACCESS_KEY_ID"
- objectName: "longhorn-backup-b2__AWS_SECRET_ACCESS_KEY"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_SECRET_ACCESS_KEY"
- objectName: "longhorn-backup-b2__AWS_ENDPOINTS"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ENDPOINTS"
secretObjects:
- secretName: longhorn-registry
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson
- secretName: longhorn-backup-b2
type: Opaque
data:
- objectName: longhorn-backup-b2__AWS_ACCESS_KEY_ID
key: AWS_ACCESS_KEY_ID
- objectName: longhorn-backup-b2__AWS_SECRET_ACCESS_KEY
key: AWS_SECRET_ACCESS_KEY
- objectName: longhorn-backup-b2__AWS_ENDPOINTS
key: AWS_ENDPOINTS

View File

@ -26,16 +26,6 @@ spec:
- key: hardware
operator: In
values: ["rpi5", "rpi4"]
- weight: 90
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
containers:
- name: sync
image: alpine:3.20

View File

@ -78,7 +78,6 @@ spec:
- --upstream=http://longhorn-frontend.longhorn-system.svc.cluster.local
- --http-address=0.0.0.0:4180
- --skip-provider-button=true
- --approval-prompt=auto
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
- --cookie-domain=longhorn.bstein.dev

View File

@ -2,7 +2,7 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
rules:
- apiGroups:
- ""

View File

@ -2,12 +2,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
subjects:
- kind: ServiceAccount
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
namespace: traefik

View File

@ -70,42 +70,10 @@ items:
dnsPolicy: ClusterFirst
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi4
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: atlas-traefik-ingress-controller
serviceAccountName: atlas-traefik-ingress-controller
serviceAccount: traefik-ingress-controller
serviceAccountName: traefik-ingress-controller
terminationGracePeriodSeconds: 30
kind: List
metadata: {}

View File

@ -1,9 +0,0 @@
# infrastructure/traefik/ingressclass.yaml
# Registers the "traefik" IngressClass and marks it as the cluster default,
# so Ingresses without an explicit ingressClassName are handled by Traefik.
apiVersion: networking.k8s.io/v1
kind: IngressClass
metadata:
  name: traefik
  annotations:
    ingressclass.kubernetes.io/is-default-class: "true"
spec:
  controller: traefik.io/ingress-controller

View File

@ -6,7 +6,6 @@ metadata:
namespace: flux-system
resources:
- crds.yaml
- ingressclass.yaml
- deployment.yaml
- serviceaccount.yaml
- clusterrole.yaml

View File

@ -2,5 +2,5 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: atlas-traefik-ingress-controller
name: traefik-ingress-controller
namespace: traefik

View File

@ -41,12 +41,3 @@ spec:
failurePolicy: Ignore
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13", "titan-15", "titan-17", "titan-19"]

View File

@ -1,152 +0,0 @@
Atlas Cluster Power Recovery (Graceful Shutdown/Startup)
Purpose
- Provide a safe operator flow for planned power events and cold-boot recovery.
- Avoid the Flux/Gitea bootstrap deadlock by using a local bootstrap fallback path.
- Break the Harbor self-hosting deadlock by seeding Harbor runtime images from a control-host bundle.
- Refuse bootstrap when UPS charge is too low, and fall back to fast shutdown if a second outage hits mid-recovery.
Bootstrapping risk to remember
- Flux source is Git over SSH to `scm.bstein.dev` (Gitea).
- Gitea itself is a Flux-managed workload and depends on storage + database.
- Harbor is also critical, but it is not part of the first recovery stage because Harbor serves its own runtime images.
- On cold boot, if Flux cannot fetch source before Gitea is up, reconciliation can stall.
- Recovery path: bring control plane and workers up, then locally apply minimal platform stack (`core -> helm -> longhorn -> metallb -> traefik -> vault-csi -> vault-injector -> vault -> postgres -> gitea`), then seed Harbor images onto the Harbor node from a control-host bundle, then resume/reconcile Flux. Harbor is a later recovery stage after storage, Vault, Postgres, and Gitea are back.
Script
- `scripts/cluster_power_recovery.sh`
- `scripts/cluster_power_console.sh`
- Modes:
- `prepare`
- `shutdown`
- `harbor-seed`
- `startup`
- `status`
- Default is dry-run. Add `--execute` to actually perform actions.
Dry-run examples
- Shutdown preview:
- `scripts/cluster_power_recovery.sh shutdown --skip-etcd-snapshot --skip-drain`
- Startup preview:
- `scripts/cluster_power_recovery.sh startup`
- Harbor seed preview:
- `scripts/cluster_power_recovery.sh harbor-seed`
Execute examples
- Prepare helper image on every node:
- `scripts/cluster_power_recovery.sh prepare --execute`
- Seed Harbor runtime images onto `titan-05` from the control-host bundle:
- `scripts/cluster_power_recovery.sh harbor-seed --execute`
- Planned shutdown:
- `scripts/cluster_power_recovery.sh shutdown --execute`
- Planned startup (canonical branch):
- `scripts/cluster_power_recovery.sh startup --execute --force-flux-branch main`
Manual remote console examples
- Canonical operator hosts:
- `titan-db`
- `tethys` (`titan-24`)
- Both hosts now have:
- `~/ananke-tools/cluster_power_recovery.sh`
- `~/ananke-tools/cluster_power_console.sh`
- `~/ananke-tools/bootstrap/recovery-config.env`
- `~/ananke-tools/bootstrap/harbor-bootstrap-images.txt`
- `~/ananke-tools/kubeconfig`
- `~/ananke-cluster-power`
- `~/bin/ananke-cluster-power`
- `~/ananke-repo/{infrastructure,services,scripts}`
- Both hosts also keep the Harbor bootstrap bundle at:
- `~/.local/share/ananke/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst`
- Remote usage:
- `ssh titan-db`
- `~/ananke-cluster-power status`
- `~/ananke-cluster-power prepare --execute`
- `~/ananke-cluster-power shutdown --execute`
- `~/ananke-cluster-power startup --execute --force-flux-branch main`
- `ssh tethys`
- `~/ananke-cluster-power status`
- `~/ananke-cluster-power prepare --execute`
- `~/ananke-cluster-power shutdown --execute`
- `~/ananke-cluster-power startup --execute --force-flux-branch main`
Useful options
- `--shutdown-mode host-poweroff|cluster-only`
- `--expected-flux-branch main`
- `--expected-flux-url ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git`
- `--force-flux-url ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git`
- `--force-flux-branch main`
- `--allow-flux-source-mutation` (required with `--force-flux-url`; breakglass only)
- `--skip-local-bootstrap` (not recommended for cold-start recovery)
- `--skip-harbor-bootstrap` (skip the Harbor recovery stage if you know Harbor should stay deferred)
- `--skip-harbor-seed` (skip bundle import if Harbor images are already cached on the target node)
- `--skip-helper-prewarm`
- `--min-startup-battery 35`
- `--ups-host pyrphoros@localhost`
- `--require-ups-battery`
- `--drain-timeout 180`
- `--emergency-drain-timeout 45`
- `--flux-ready-timeout 1200`
- `--startup-checklist-timeout 900`
- `--startup-stability-window 180`
- `--startup-stability-timeout 900`
- `--recovery-state-file ~/.local/share/ananke/cluster_power_recovery.state`
- `--harbor-bundle-file ~/.local/share/ananke/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst`
Controlled drill checklist (recommended)
- Operator host: use `titan-db` as canonical control host for the drill.
- On-site coordination:
- Have on-site operator ready before shutdown starts.
- Confirm they will manually power cluster nodes back on after shutdown completes.
- Confirm who will announce "all nodes powered on" to resume startup.
- Preflight on `titan-db`:
- `mkdir -p ~/ananke-logs`
- `~/ananke-cluster-power status` and verify:
- `ups_host=pyrphoros@localhost`
- `ups_battery` is numeric
- `flux_source_ready=True`
- Warm helper image just before shutdown:
- `~/ananke-cluster-power prepare --execute`
- Run in a persistent shell and capture logs:
- `tmux new -s ananke-drill`
- `script -q -a ~/ananke-logs/ananke-drill-$(date +%Y%m%d-%H%M%S).log`
- Execute controlled shutdown with telemetry enforcement:
- `~/ananke-cluster-power shutdown --execute --require-ups-battery`
- After on-site power-on confirmation, execute startup:
- `~/ananke-cluster-power startup --execute --force-flux-branch main --require-ups-battery`
- Post-check:
- `~/ananke-cluster-power status`
- Verify critical services (`longhorn`, `vault`, `postgres`, `gitea`, `harbor`, `pegasus`) and no widespread pull/crash failures.
Operational notes
- The flow suspends Flux Kustomizations/HelmReleases during shutdown to prevent churn.
- Shutdown behavior is explicit:
- `host-poweroff` schedules host poweroff after service stop.
- `cluster-only` stops `k3s`/`k3s-agent` without powering hosts off.
- Worker drain is no longer best-effort only. Once the configured timeout is exhausted, the script escalates from a normal drain to `--force`, and finally to `--disable-eviction`.
- Startup fails fast if Flux source URL/branch drift from expected values (unless branch override is explicitly requested with `--force-flux-branch`).
- Flux desired-state source remains `titan-iac.git`. Ananke orchestrates runtime recovery and should not be used as the normal Flux source repo.
- During startup, if Flux source is not `Ready`, local bootstrap fallback is applied first using the repo snapshot under `~/ananke-repo`.
- Longhorn is reconciled before Vault/Postgres/Gitea so storage-backed services are not racing the volume layer.
- Harbor is reconciled after the first critical stateful services.
- Harbor bootstrap is now designed around a control-host bundle:
- Build the Harbor bundle locally with `scripts/build_harbor_bootstrap_bundle.sh`.
- Stage it on the operator host at `~/.local/share/ananke/bundles/harbor-bootstrap-v2.14.1-arm64.tar.zst`.
- Use `harbor-seed --execute` or a full `startup --execute` to stream/import that bundle onto `titan-05`.
- The Harbor bundle remains arm64-only because Harbor is pinned to arm64 nodes. The node-helper image is multi-arch because Ananke uses it across both arm64 and amd64 nodes during prepare/shutdown operations.
- Ananke uses a temporary privileged helper pod for host-side operations. The helper image is prewarmed with `prepare --execute` so later shutdown/startup steps do not stall on image pulls.
- The script persists outage state in `~/.local/share/ananke/cluster_power_recovery.state` by default. If startup is attempted during an outage window and power becomes unstable again, rerunning startup with insufficient UPS charge will flip into the emergency shutdown path instead of continuing to bootstrap.
- Startup completion is strict now:
- all non-optional Flux kustomizations must be `Ready=True`
- external service checklist must pass (defaults include Gitea, Grafana, Harbor)
- generated ingress reachability checks must pass (default accepted codes: `200,301,302,307,308,401,403,404`)
- stability soak must pass with no crashloop/pull-failure churn
- If Flux hits immutable one-off Job drift during reconcile, Ananke now attempts self-heal by pruning failed Flux-managed Jobs and retrying reconcile.
- In dry-run mode, the script now skips the live API wait step so preview runs do not stall on an offline cluster.
- Dry-run mode no longer mutates outage recovery state.
- `harbor-seed --execute` was validated by:
- prewarming the helper image across all nodes
- streaming the Harbor bootstrap bundle to `titan-05`
- importing Harbor runtime images into host `containerd`
- successfully running a Harbor-backed canary pod (`harbor-canary-ok`)
- After bootstrap, Flux resources are resumed and reconciled.
- Keep this runbook aligned with `clusters/atlas/flux-system/gotk-sync.yaml`.

View File

@ -1,3 +0,0 @@
[pytest]
addopts = -ra
norecursedirs = .git .venv .venv-ci __pycache__ tmp

View File

@ -1,9 +0,0 @@
# Harbor cold-start bootstrap images.
registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
registry.bstein.dev/infra/harbor-portal:v2.14.1-arm64
registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64
registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
registry.bstein.dev/infra/harbor-redis:v2.14.1-arm64
registry.bstein.dev/infra/harbor-nginx:v2.14.1-arm64
registry.bstein.dev/infra/harbor-prepare:v2.14.1-arm64

View File

@ -1,36 +0,0 @@
# Ananke cluster power recovery defaults.
# NOTE(review): presumably sourced by cluster_power_recovery.sh, with that
# script's CLI flags overriding these values per run -- confirm.

# Operator host and Flux source expectations.
CANONICAL_CONTROL_HOST="titan-db"
DEFAULT_FLUX_BRANCH="main"
EXPECTED_FLUX_URL="ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
# host-poweroff powers hosts off after service stop; cluster-only stops
# k3s/k3s-agent without powering hosts off.
SHUTDOWN_MODE="host-poweroff"
# State/bundle directory relative to the operator's home directory.
STATE_SUBDIR=".local/share/ananke"

# Harbor cold-start bootstrap bundle.
HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
HARBOR_TARGET_NODE=""  # empty: presumably resolved at runtime -- TODO confirm
HARBOR_CANARY_NODE=""  # empty: presumably resolved at runtime -- TODO confirm
HARBOR_HOST_LABEL_KEY="ananke.bstein.dev/harbor-bootstrap"
HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"

# Privileged node-helper pod used for host-side operations.
NODE_HELPER_IMAGE="registry.bstein.dev/bstein/ananke-node-helper:0.1.0"
NODE_HELPER_NAMESPACE="maintenance"
NODE_HELPER_SERVICE_ACCOUNT="default"
REGISTRY_PULL_SECRET="harbor-regcred"
BUNDLE_HTTP_PORT="8877"

# UPS telemetry endpoint (user@host form) and the battery charge key.
UPS_HOST="pyrphoros@localhost"
UPS_BATTERY_KEY="battery.charge"

# Startup gating: Flux readiness, service checklist, workload readiness and
# stability-soak timeouts / poll intervals, all in seconds.
FLUX_READY_TIMEOUT_SECONDS="1200"
FLUX_READY_POLL_SECONDS="10"
STARTUP_CHECKLIST_TIMEOUT_SECONDS="900"
STARTUP_CHECKLIST_POLL_SECONDS="10"
STARTUP_WORKLOAD_TIMEOUT_SECONDS="900"
STARTUP_WORKLOAD_POLL_SECONDS="10"
STARTUP_STABILITY_WINDOW_SECONDS="180"
STARTUP_STABILITY_TIMEOUT_SECONDS="900"
STARTUP_STABILITY_POLL_SECONDS="10"
# Exclusion lists/regexes; empty means nothing is excluded.
STARTUP_OPTIONAL_KUSTOMIZATIONS=""
STARTUP_IGNORE_PODS_REGEX=""
STARTUP_IGNORE_WORKLOADS_REGEX=""
STARTUP_WORKLOAD_NAMESPACE_EXCLUDES_REGEX="^(kube-system|kube-public|kube-node-lease|flux-system)$"
STARTUP_SERVICE_CHECK_TIMEOUT_SECONDS="10"
STARTUP_INCLUDE_INGRESS_CHECKS="1"
# HTTP statuses accepted when probing generated ingress hosts.
STARTUP_INGRESS_ALLOWED_STATUSES="200,301,302,307,308,401,403,404"
STARTUP_IGNORE_INGRESS_HOSTS_REGEX=""
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="10"
# Semicolon-separated checks; fields look like
# name|url|accepted-statuses|body-substring|... -- TODO confirm field
# semantics against the recovery script's parser.
STARTUP_SERVICE_CHECKLIST='gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||;grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||;harbor|https://registry.bstein.dev/v2/|200,401|||'

View File

@ -1,56 +0,0 @@
#!/usr/bin/env bash
# Build and push the multi-arch Ananke node-helper image via docker buildx.
set -euo pipefail

# Defaults; each is overridable via the matching CLI flag below.
IMAGE="registry.bstein.dev/bstein/ananke-node-helper:0.1.0"
DOCKER_CONFIG_PATH=""
PLATFORMS="linux/amd64,linux/arm64"
BUILDER_NAME="ananke-node-helper-builder"

# Flag parsing: every option takes a value except -h/--help.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --image)
      IMAGE="${2:?missing image}"
      shift 2
      ;;
    --docker-config)
      DOCKER_CONFIG_PATH="${2:?missing docker config path}"
      shift 2
      ;;
    --platforms)
      PLATFORMS="${2:?missing platforms}"
      shift 2
      ;;
    --builder)
      BUILDER_NAME="${2:?missing builder}"
      shift 2
      ;;
    -h|--help)
      cat <<USAGE
Usage: scripts/build_ananke_node_helper.sh [--image <image>] [--docker-config <path>] [--platforms <csv>] [--builder <name>]
USAGE
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Point docker at an alternate credential store when requested.
if [[ -n "${DOCKER_CONFIG_PATH}" ]]; then
  export DOCKER_CONFIG="${DOCKER_CONFIG_PATH}"
fi

# Reuse the named buildx builder if it already exists, otherwise create it.
if ! docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
  docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use >/dev/null
else
  docker buildx use "${BUILDER_NAME}" >/dev/null
fi
docker buildx inspect --bootstrap >/dev/null

# Cross-build for all requested platforms and push in a single step.
docker buildx build \
  --platform "${PLATFORMS}" \
  -f dockerfiles/Dockerfile.ananke-node-helper \
  -t "${IMAGE}" \
  --push \
  .

View File

@ -1,58 +0,0 @@
#!/usr/bin/env bash
# Pull the Harbor cold-start images and pack them into a zstd-compressed
# docker-save bundle that can be seeded onto the Harbor node during recovery.
set -euo pipefail

# Defaults; each is overridable via the matching CLI flag below.
IMAGES_FILE="scripts/bootstrap/harbor-bootstrap-images.txt"
BUNDLE_FILE="artifacts/harbor-bootstrap-v2.14.1-arm64.tar.zst"
DOCKER_CONFIG_PATH=""
PLATFORM="linux/arm64"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --images-file)
      IMAGES_FILE="${2:?missing images file}"
      shift 2
      ;;
    --bundle-file)
      BUNDLE_FILE="${2:?missing bundle file}"
      shift 2
      ;;
    --docker-config)
      DOCKER_CONFIG_PATH="${2:?missing docker config path}"
      shift 2
      ;;
    --platform)
      PLATFORM="${2:?missing platform}"
      shift 2
      ;;
    -h|--help)
      cat <<USAGE
Usage: scripts/build_harbor_bootstrap_bundle.sh [--images-file <path>] [--bundle-file <path>] [--docker-config <path>] [--platform <linux/arm64>]
USAGE
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Point docker at an alternate credential store when requested.
if [[ -n "${DOCKER_CONFIG_PATH}" ]]; then
  export DOCKER_CONFIG="${DOCKER_CONFIG_PATH}"
fi

# Read the image list, dropping comment lines and blank lines.
mapfile -t IMAGES < <(grep -v '^[[:space:]]*#' "${IMAGES_FILE}" | sed '/^[[:space:]]*$/d')
if [[ ${#IMAGES[@]} -eq 0 ]]; then
  echo "No images found in ${IMAGES_FILE}" >&2
  exit 1
fi

mkdir -p "$(dirname "${BUNDLE_FILE}")"
# Pull each image for the pinned platform so the saved bundle is arch-correct.
for image in "${IMAGES[@]}"; do
  echo "Pulling ${image}" >&2
  docker pull --platform "${PLATFORM}" "${image}" >/dev/null
done
# Stream all images through docker save into a level-19 zstd archive.
docker save "${IMAGES[@]}" | zstd -T0 -19 -o "${BUNDLE_FILE}"
echo "Wrote ${BUNDLE_FILE}" >&2

View File

@ -1,87 +0,0 @@
#!/usr/bin/env bash
# Console wrapper around cluster_power_recovery.sh: run it locally when a
# usable copy plus kubectl exist, otherwise delegate over SSH to the
# canonical control host (titan-db by default).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Print usage; the heredoc is single-quoted so $HOME stays literal.
usage() {
  cat <<'USAGE'
Usage:
  scripts/cluster_power_console.sh [--repo-dir <path>] [--delegate-host <host>] <shutdown|startup> [recovery-script-options...]

Purpose:
  Friendly manual entrypoint for running Ananke from a remote console.
  Canonical control host is titan-db by default so bundle/state handling stays in one place.

Defaults:
  --repo-dir $HOME/Development/ananke (fallback: $HOME/Development/titan-iac)
  --delegate-host titan-db

Examples:
  scripts/cluster_power_console.sh shutdown --execute
  scripts/cluster_power_console.sh startup --execute --force-flux-branch main
  scripts/cluster_power_console.sh --delegate-host titan-24 shutdown --execute
USAGE
}

# Prefer the ananke checkout; fall back to the titan-iac checkout.
if [[ -d "${HOME}/Development/ananke" ]]; then
  REPO_DIR="${HOME}/Development/ananke"
else
  REPO_DIR="${HOME}/Development/titan-iac"
fi
DELEGATE_HOST="titan-db"
REMOTE_REPO_DIR="${ANANKE_REMOTE_REPO_DIR:-}"

# Consume wrapper-level flags; the first unrecognized token and everything
# after it are passed through to the recovery script verbatim.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --repo-dir)
      REPO_DIR="${2:-}"
      shift 2
      ;;
    --delegate-host)
      DELEGATE_HOST="${2:-}"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      break
      ;;
  esac
done
if [[ $# -lt 1 ]]; then
  usage
  exit 1
fi

# Resolve a runnable local copy of the recovery script, sibling copy first.
SIBLING_SCRIPT="${SCRIPT_DIR}/cluster_power_recovery.sh"
REPO_SCRIPT="${REPO_DIR}/scripts/cluster_power_recovery.sh"
LOCAL_SCRIPT=""
if [[ -x "${SIBLING_SCRIPT}" ]]; then
  LOCAL_SCRIPT="${SIBLING_SCRIPT}"
elif [[ -x "${REPO_SCRIPT}" ]]; then
  LOCAL_SCRIPT="${REPO_SCRIPT}"
fi
# Run locally only when kubectl is also available; otherwise delegate.
if [[ -n "${LOCAL_SCRIPT}" ]] && command -v kubectl >/dev/null 2>&1; then
  exec "${LOCAL_SCRIPT}" "$@"
fi
if [[ -z "${DELEGATE_HOST}" ]]; then
  echo "cluster-power-console: no usable local recovery script found and no delegate host configured" >&2
  exit 1
fi

# Shell-quote the pass-through args so they survive the remote shell.
quoted_args="$(printf '%q ' "$@")"
quoted_repo_dir="$(printf '%q' "${REPO_DIR}")"
remote_cmd=""
if [[ -n "${REMOTE_REPO_DIR}" ]]; then
  remote_cmd+="ANANKE_REPO_DIR=$(printf '%q' "${REMOTE_REPO_DIR}") "
fi
# On the delegate host, prefer the staged ~/ananke-tools copy, then the
# repo copy; fail loudly if neither exists.
remote_cmd+="if [ -x ~/ananke-tools/cluster_power_recovery.sh ]; then ~/ananke-tools/cluster_power_recovery.sh ${quoted_args}; elif [ -x ${quoted_repo_dir}/scripts/cluster_power_recovery.sh ]; then ${quoted_repo_dir}/scripts/cluster_power_recovery.sh ${quoted_args}; else echo 'cluster-power-console: remote recovery script not found' >&2; exit 1; fi"
exec ssh -o BatchMode=yes -o ConnectTimeout=8 "${DELEGATE_HOST}" "${remote_cmd}"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,163 +0,0 @@
#!/usr/bin/env bash
# Capture recovery artifacts for a failing node, optionally cordon/drain it,
# optionally delete the Node object, and emit ready-to-run scripts that
# restore its labels/taints onto a replacement node.
set -euo pipefail

usage() {
  cat <<USAGE
Usage: scripts/node_recover.sh <node-name> [options]

Options:
  --yes            Skip confirmation prompt
  --skip-drain     Do not cordon/drain; only capture recovery artifacts
  --delete-node    Delete Node object after drain (for hard-dead node replacement)
  --out-dir <dir>  Recovery artifact directory (default: ./artifacts/node-recovery)
  -h, --help       Show this help
USAGE
}

# Hard prerequisites.
if ! command -v kubectl >/dev/null 2>&1; then
  echo "kubectl is required" >&2
  exit 1
fi
if ! command -v jq >/dev/null 2>&1; then
  echo "jq is required" >&2
  exit 1
fi
if [ "$#" -lt 1 ]; then
  usage
  exit 1
fi

node=""
assume_yes="false"
skip_drain="false"
delete_node="false"
out_dir="./artifacts/node-recovery"
# Parse flags; the single positional argument is the node name.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --yes)
      assume_yes="true"
      shift
      ;;
    --skip-drain)
      skip_drain="true"
      shift
      ;;
    --delete-node)
      delete_node="true"
      shift
      ;;
    --out-dir)
      out_dir="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    -*)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
    *)
      if [ -z "${node}" ]; then
        node="$1"
      else
        echo "Unexpected argument: $1" >&2
        usage
        exit 1
      fi
      shift
      ;;
  esac
done
if [ -z "${node}" ]; then
  echo "Node name is required" >&2
  usage
  exit 1
fi
if ! kubectl get node "${node}" >/dev/null 2>&1; then
  echo "Node ${node} not found in cluster API" >&2
  exit 1
fi

# Destructive-operation guard: operator must retype the node name.
if [ "${assume_yes}" != "true" ]; then
  echo "About to prepare recovery workflow for node: ${node}"
  echo "skip_drain=${skip_drain} delete_node=${delete_node}"
  read -r -p "Type the node name to continue: " confirm
  if [ "${confirm}" != "${node}" ]; then
    echo "Confirmation did not match node name; aborting."
    exit 1
  fi
fi

timestamp="$(date +%Y%m%d-%H%M%S)"
artifacts_dir="${out_dir}/${node}-${timestamp}"
mkdir -p "${artifacts_dir}"
echo "Saving node and workload artifacts to ${artifacts_dir}"
# Snapshot the node object and every pod currently scheduled on it.
kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"

# Generate a label-restore script, skipping hostname/instance-type labels and
# the kubelet-owned kubernetes.io/* label namespaces a replacement node sets
# itself.
jq -r '
  .metadata.labels
  | to_entries[]
  | select(
      .key != "kubernetes.io/hostname"
      and .key != "beta.kubernetes.io/hostname"
      and .key != "node.kubernetes.io/instance-type"
      and .key != "beta.kubernetes.io/instance-type"
      and (.key | startswith("kubernetes.io/") | not)
      and (.key | startswith("beta.kubernetes.io/") | not)
      and (.key | startswith("node.kubernetes.io/") | not)
    )
  | "kubectl label node <replacement-node> " + .key + "=" + .value + " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
# Generate a taint-restore script; taint values are optional in the spec.
jq -r '
  (.spec.taints // [])[]
  | "kubectl taint node <replacement-node> "
    + .key
    + (if .value then "=" + .value else "" end)
    + ":"
    + .effect
    + " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"
chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"

if [ "${skip_drain}" != "true" ]; then
  echo "Cordoning ${node}"
  kubectl cordon "${node}" || true
  echo "Draining ${node}"
  # Escalating drain: normal -> --force -> --force --disable-eviction.
  if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
    echo "Standard drain failed; retrying with --force"
    if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then
      echo "Force drain failed; retrying with --disable-eviction"
      kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction
    fi
  fi
fi

if [ "${delete_node}" = "true" ]; then
  echo "Deleting node object ${node}"
  kubectl delete node "${node}" || true
fi

cat <<NEXT
Recovery prep complete for ${node}.
Artifacts: ${artifacts_dir}
Next steps:
  1) Reimage/reprovision replacement host.
  2) Rejoin k3s and wait for node Ready.
  3) Reapply labels: ${artifacts_dir}/restore-labels.sh
  4) Reapply taints: ${artifacts_dir}/restore-taints.sh
  5) Validate pods and uncordon replacement when ready.
NEXT

View File

@ -4,21 +4,13 @@ import pathlib
def load_module():
path = pathlib.Path(__file__).resolve().parents[1] / "dashboards_render_atlas.py"
spec = importlib.util.spec_from_file_location("scripts.dashboards_render_atlas", path)
spec = importlib.util.spec_from_file_location("dashboards_render_atlas", path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def flatten_panels(panels):
    """Return each panel followed by its immediate children (one level deep)."""
    return [
        item
        for entry in panels
        for item in (entry, *entry.get("panels", []))
    ]
def test_table_panel_options_and_filterable():
mod = load_module()
panel = mod.table_panel(
@ -64,71 +56,3 @@ def test_render_configmap_writes(tmp_path):
content = (tmp_path / "cm.yaml").read_text()
assert "kind: ConfigMap" in content
assert f"{uid}.json" in content
def test_testing_suite_variable_uses_canonical_values_only():
    # The suite template variable must expose only the canonical suite names;
    # legacy suite aliases may not leak into the query, allValue, or options.
    mod = load_module()
    variable = mod.testing_suite_variable()
    # allValue is the regex alternation of every canonical suite name.
    canonical_matcher = "|".join(mod.PLATFORM_TEST_SUITE_NAMES)
    legacy_names = {"bstein-home", "data-prepper", "titan-iac", "pegasus-health"}
    assert variable["allValue"] == canonical_matcher
    assert not any(alias in variable["query"] for alias in legacy_names)
    assert not any(alias in variable["allValue"] for alias in legacy_names)
    # Option order must mirror the canonical name list exactly.
    assert [option["value"] for option in variable["options"]] == mod.PLATFORM_TEST_SUITE_NAMES
def test_jobs_dashboard_separates_current_gate_health_from_reliability():
    # Current gate health and 24h run reliability must be distinct panels,
    # and the old combined 24h failure/success panels must be gone.
    mod = load_module()
    dashboard = mod.build_jobs_dashboard()
    panels_by_title = {panel["title"]: panel for panel in flatten_panels(dashboard["panels"])}
    assert "Current Gate Health by Suite" in panels_by_title
    assert "Run Reliability by Suite (24h)" in panels_by_title
    assert "Run Reliability History by Suite" in panels_by_title
    assert "Failures by Suite (24h)" not in panels_by_title
    assert "Success Rate by Suite (24h)" not in panels_by_title
    # Gate health groups by check and treats non-failure results as passing.
    current_gate_expr = panels_by_title["Current Gate Health by Suite"]["targets"][0]["expr"]
    assert 'check)' in current_gate_expr
    assert 'result=~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
    reliability_panel = panels_by_title["Run Reliability by Suite (24h)"]
    reliability_expr = reliability_panel["targets"][0]["expr"]
    assert "platform_quality_gate_runs_total" in reliability_expr
    assert "> 0" in reliability_expr
    assert "- 1" in reliability_expr
    # Suites with zero runs map to -1, rendered as "no runs" in the UI.
    assert reliability_panel["fieldConfig"]["defaults"]["mappings"] == [
        {"type": "value", "options": {"-1": {"text": "no runs"}}}
    ]
def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint():
    # Heavy drilldown panels must live inside collapsed rows so the initial
    # dashboard paint issues only the queries of the 11 visible panels.
    mod = load_module()
    dashboard = mod.build_jobs_dashboard()
    panels = dashboard["panels"]
    rows = [panel for panel in panels if panel["type"] == "row"]
    visible_query_panels = [panel for panel in panels if panel["type"] != "row"]
    # Children of rows, keyed by title, for the membership checks below.
    nested_panels_by_title = {
        child["title"]: child
        for row in rows
        for child in row.get("panels", [])
    }
    assert len(panels) == 16
    assert len(visible_query_panels) == 11
    # One query target per visible panel keeps first paint light.
    assert sum(len(panel.get("targets", [])) for panel in visible_query_panels) == 11
    assert [row["title"] for row in rows] == [
        "Reliability And Run History",
        "Failure Trends By Check",
        "Success Trends By Check",
        "Test Drilldowns And Problem Tests",
        "Telemetry Completeness, SonarQube, And Branches",
    ]
    assert all(row["collapsed"] for row in rows)
    # Spot-check that the expected drilldowns ended up inside the rows.
    assert "Failure Trend: Coverage" in nested_panels_by_title
    assert "Success Trend: Supply Chain" in nested_panels_by_title
    assert "Selected Test Pass Rate History" in nested_panels_by_title
    assert "Missing Coverage Metrics by Suite" in nested_panels_by_title
    assert "SonarQube API Up" in nested_panels_by_title

View File

@ -1,7 +1,5 @@
import importlib.util
import pathlib
import sys
import types
import pytest
@ -22,26 +20,6 @@ def load_sync_module(monkeypatch):
}
for k, v in env.items():
monkeypatch.setenv(k, v)
fake_psycopg2 = types.ModuleType("psycopg2")
fake_psycopg2.Error = Exception
fake_psycopg2.connect = lambda **kwargs: None
fake_psycopg2_extras = types.ModuleType("psycopg2.extras")
fake_psycopg2_extras.RealDictCursor = object
fake_passlib = types.ModuleType("passlib")
fake_passlib_hash = types.ModuleType("passlib.hash")
class _FakeBcryptSha256:
@staticmethod
def hash(password):
return f"stub:{password}"
fake_passlib_hash.bcrypt_sha256 = _FakeBcryptSha256
fake_passlib.hash = fake_passlib_hash
monkeypatch.setitem(sys.modules, "psycopg2", fake_psycopg2)
monkeypatch.setitem(sys.modules, "psycopg2.extras", fake_psycopg2_extras)
monkeypatch.setitem(sys.modules, "passlib", fake_passlib)
monkeypatch.setitem(sys.modules, "passlib.hash", fake_passlib_hash)
module_path = (
pathlib.Path(__file__).resolve().parents[2]
/ "services"
@ -138,100 +116,6 @@ def test_kc_get_users_paginates(monkeypatch):
assert sync.SESSION.calls == 1
def test_kc_get_users_fetches_second_page_after_full_batch(monkeypatch):
sync = load_sync_module(monkeypatch)
class _PagedSession:
def __init__(self):
self.calls = 0
self.first_params = []
def get(self, *_, **kwargs):
self.calls += 1
self.first_params.append(kwargs["params"]["first"])
if self.calls == 1:
return _FakeResponse([{"id": f"u{i}"} for i in range(200)])
return _FakeResponse([{"id": "last"}])
sync.SESSION = _PagedSession()
users = sync.kc_get_users("tok")
assert len(users) == 201
assert sync.SESSION.first_params == [0, 200]
def test_get_kc_token_posts_client_credentials(monkeypatch):
sync = load_sync_module(monkeypatch)
calls = []
class _TokenSession:
def post(self, url, data, timeout):
calls.append((url, data, timeout))
return _FakeResponse({"access_token": "tok"})
sync.SESSION = _TokenSession()
assert sync.get_kc_token() == "tok"
assert calls[0][1]["grant_type"] == "client_credentials"
def test_retry_request_retries_then_succeeds(monkeypatch):
sync = load_sync_module(monkeypatch)
attempts = []
sleeps = []
def _flaky():
attempts.append(1)
if len(attempts) == 1:
raise sync.requests.RequestException("temporary")
return "ok"
monkeypatch.setattr(sync.time, "sleep", lambda seconds: sleeps.append(seconds))
assert sync.retry_request("request", _flaky, attempts=2) == "ok"
assert sleeps == [2]
def test_retry_request_reraises_final_error(monkeypatch):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.time, "sleep", lambda seconds: None)
with pytest.raises(sync.requests.RequestException):
sync.retry_request(
"request",
lambda: (_ for _ in ()).throw(sync.requests.RequestException("nope")),
attempts=1,
)
def test_retry_db_connect_retries_then_succeeds(monkeypatch):
sync = load_sync_module(monkeypatch)
attempts = []
sleeps = []
def _connect(**kwargs):
attempts.append(kwargs)
if len(attempts) == 1:
raise sync.psycopg2.Error("not yet")
return "conn"
monkeypatch.setattr(sync.psycopg2, "connect", _connect)
monkeypatch.setattr(sync.time, "sleep", lambda seconds: sleeps.append(seconds))
assert sync.retry_db_connect(attempts=2) == "conn"
assert sleeps == [2]
def test_retry_db_connect_reraises_final_error(monkeypatch):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.psycopg2, "connect", lambda **kwargs: (_ for _ in ()).throw(sync.psycopg2.Error("down")))
monkeypatch.setattr(sync.time, "sleep", lambda seconds: None)
with pytest.raises(sync.psycopg2.Error):
sync.retry_db_connect(attempts=1)
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
sync = load_sync_module(monkeypatch)
executed = []
@ -260,87 +144,6 @@ def test_ensure_mailu_user_upserts(monkeypatch):
assert captured["password"] != "pw"
def test_attribute_and_email_helpers(monkeypatch):
sync = load_sync_module(monkeypatch)
assert sync.get_attribute_value({"x": ["first", "second"]}, "x") == "first"
assert sync.get_attribute_value({"x": []}, "x") is None
assert sync.get_attribute_value({"x": "value"}, "x") == "value"
assert sync.mailu_enabled({"mailu_email": ["legacy@example.com"]}) is True
assert sync.mailu_enabled({"mailu_enabled": ["off"]}) is False
assert sync.resolve_mailu_email({"username": "fallback", "email": "user@example.com"}, {}) == "user@example.com"
assert sync.resolve_mailu_email({"username": "fallback", "email": "user@other.com"}, {}) == "fallback@example.com"
def test_safe_update_payload_filters_fields(monkeypatch):
sync = load_sync_module(monkeypatch)
payload = sync._safe_update_payload(
{
"username": "user",
"enabled": True,
"email": "user@example.com",
"emailVerified": False,
"firstName": "User",
"lastName": "Example",
"requiredActions": ["UPDATE_PASSWORD", 7],
"attributes": "not-a-dict",
"ignored": "value",
}
)
assert payload == {
"username": "user",
"enabled": True,
"email": "user@example.com",
"emailVerified": False,
"firstName": "User",
"lastName": "Example",
"requiredActions": ["UPDATE_PASSWORD"],
"attributes": {},
}
def test_ensure_system_mailboxes_handles_configurations(monkeypatch, capsys):
sync = load_sync_module(monkeypatch)
ensured = []
monkeypatch.setattr(sync, "MAILU_SYSTEM_USERS", ["postmaster@example.com", "abuse"])
monkeypatch.setattr(sync, "MAILU_SYSTEM_PASSWORD", "")
sync.ensure_system_mailboxes(object())
assert "MAILU_SYSTEM_PASSWORD is missing" in capsys.readouterr().out
def _ensure(cursor, email, password, display_name):
ensured.append((email, password, display_name))
if email == "abuse":
raise RuntimeError("boom")
monkeypatch.setattr(sync, "MAILU_SYSTEM_PASSWORD", "pw")
monkeypatch.setattr(sync, "ensure_mailu_user", _ensure)
sync.ensure_system_mailboxes(object())
out = capsys.readouterr().out
assert ensured == [
("postmaster@example.com", "pw", "postmaster"),
("abuse", "pw", "abuse"),
]
assert "Ensured system mailbox for postmaster@example.com" in out
assert "Failed to ensure system mailbox abuse" in out
def test_main_exits_without_users_or_system_mailboxes(monkeypatch, capsys):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync, "MAILU_SYSTEM_USERS", [])
monkeypatch.setattr(sync, "get_kc_token", lambda: "tok")
monkeypatch.setattr(sync, "kc_get_users", lambda token: [])
sync.main()
assert "No users found; exiting." in capsys.readouterr().out
def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")

View File

@ -1,134 +0,0 @@
import importlib.util
import io
import pathlib
import types
def load_listener_module(monkeypatch):
monkeypatch.setenv("MAILU_SYNC_WAIT_TIMEOUT_SEC", "0")
module_path = (
pathlib.Path(__file__).resolve().parents[2]
/ "services"
/ "mailu"
/ "scripts"
/ "mailu_sync_listener.py"
)
spec = importlib.util.spec_from_file_location("mailu_sync_listener_testmod", module_path)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
def _handler_for(listener, body):
handler = listener.Handler.__new__(listener.Handler)
raw = body if isinstance(body, bytes) else body.encode()
handler.headers = {"Content-Length": str(len(raw))}
handler.rfile = io.BytesIO(raw)
handler.responses = []
handler.headers_ended = 0
handler.send_response = lambda code: handler.responses.append(code)
handler.end_headers = lambda: setattr(handler, "headers_ended", handler.headers_ended + 1)
return handler
def test_listener_run_sync_blocking_updates_state(monkeypatch):
listener = load_listener_module(monkeypatch)
monkeypatch.setattr(listener, "time", lambda: 42.0)
monkeypatch.setattr(
listener.subprocess,
"run",
lambda command, check: types.SimpleNamespace(returncode=3),
)
assert listener._run_sync_blocking() == 3
assert listener.last_rc == 3
assert listener.last_run == 42.0
assert listener.sync_done.is_set()
listener.sync_running = True
assert listener._run_sync_blocking() == 0
def test_listener_trigger_sync_async_honors_running_and_debounce(monkeypatch):
listener = load_listener_module(monkeypatch)
starts = []
class _Thread:
def __init__(self, target, daemon):
self.target = target
self.daemon = daemon
def start(self):
starts.append((self.target, self.daemon))
monkeypatch.setattr(listener.threading, "Thread", _Thread)
monkeypatch.setattr(listener, "time", lambda: 100.0)
listener.sync_running = True
assert listener._trigger_sync_async() is False
listener.sync_running = False
listener.last_run = 95.0
assert listener._trigger_sync_async() is False
assert listener._trigger_sync_async(force=True) is True
assert starts and starts[0][1] is True
def test_listener_post_rejects_invalid_json(monkeypatch):
listener = load_listener_module(monkeypatch)
handler = _handler_for(listener, b"{not-json")
handler.do_POST()
assert handler.responses == [400]
assert handler.headers_ended == 1
def test_listener_post_triggers_async_without_wait(monkeypatch):
listener = load_listener_module(monkeypatch)
called = []
monkeypatch.setattr(listener, "_trigger_sync_async", lambda force=False: called.append(force) or True)
handler = _handler_for(listener, '{"force": true}')
handler.do_POST()
assert called == [True]
assert handler.responses == [202]
def test_listener_post_wait_returns_success_or_failure(monkeypatch):
listener = load_listener_module(monkeypatch)
called = []
monkeypatch.setattr(listener, "_trigger_sync_async", lambda force=False: called.append(force) or True)
listener.sync_running = False
listener.last_rc = 0
handler = _handler_for(listener, '{"wait": true, "force": true}')
handler.do_POST()
assert called == [True]
assert handler.responses == [200]
listener.last_rc = 2
handler = _handler_for(listener, '{"wait": true}')
handler.do_POST()
assert handler.responses == [500]
def test_listener_post_wait_keeps_running_request_successful(monkeypatch):
listener = load_listener_module(monkeypatch)
listener.sync_running = True
handler = _handler_for(listener, '{"wait": true}')
handler.do_POST()
assert handler.responses == [200]
def test_listener_log_message_is_quiet(monkeypatch):
listener = load_listener_module(monkeypatch)
handler = listener.Handler.__new__(listener.Handler)
assert handler.log_message("ignored %s", "value") is None

View File

@ -1,73 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
MODE="${1:-dry-run}"
if [[ "$MODE" != "dry-run" && "$MODE" != "active" ]]; then
echo "usage: $0 [dry-run|active]" >&2
exit 2
fi
EXPECTED_DRY_RUN="true"
PROM_MODE="dry_run"
if [[ "$MODE" == "active" ]]; then
EXPECTED_DRY_RUN="false"
PROM_MODE="delete"
fi
KUSTOMIZATION="${KUSTOMIZATION:-maintenance}"
NAMESPACE="${NAMESPACE:-maintenance}"
DEPLOYMENT="${DEPLOYMENT:-ariadne}"
LOCAL_METRICS_PORT="${LOCAL_METRICS_PORT:-18080}"
for cmd in flux kubectl curl grep awk; do
if ! command -v "$cmd" >/dev/null 2>&1; then
echo "missing required command: $cmd" >&2
exit 2
fi
done
echo "[1/5] reconcile Flux kustomization: ${KUSTOMIZATION}"
flux reconcile kustomization "$KUSTOMIZATION" --namespace flux-system --with-source
echo "[2/5] wait for deployment rollout"
kubectl -n "$NAMESPACE" rollout status "deployment/$DEPLOYMENT" --timeout=5m
echo "[3/5] verify ariadne env wiring"
ENV_DUMP="$(kubectl -n "$NAMESPACE" get deployment "$DEPLOYMENT" -o jsonpath='{range .spec.template.spec.containers[0].env[*]}{.name}={.value}{"\n"}{end}')"
echo "$ENV_DUMP" | grep -F "ARIADNE_SCHEDULE_JENKINS_WORKSPACE_CLEANUP=45 */6 * * *"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_NAMESPACE=jenkins"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_PVC_PREFIX=pvc-workspace-"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_CLEANUP_MIN_AGE_HOURS=24"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_CLEANUP_DRY_RUN=${EXPECTED_DRY_RUN}"
echo "$ENV_DUMP" | grep -F "JENKINS_WORKSPACE_CLEANUP_MAX_DELETIONS_PER_RUN=20"
echo "[4/5] scrape /metrics and confirm cleanup metrics are exported"
PF_LOG="$(mktemp)"
METRICS_FILE="$(mktemp)"
cleanup() {
if [[ -n "${PF_PID:-}" ]]; then
kill "$PF_PID" >/dev/null 2>&1 || true
wait "$PF_PID" 2>/dev/null || true
fi
rm -f "$PF_LOG" "$METRICS_FILE"
}
trap cleanup EXIT
kubectl -n "$NAMESPACE" port-forward "deployment/$DEPLOYMENT" "${LOCAL_METRICS_PORT}:8080" >"$PF_LOG" 2>&1 &
PF_PID=$!
sleep 2
curl -fsS "http://127.0.0.1:${LOCAL_METRICS_PORT}/metrics" >"$METRICS_FILE"
grep -F "# HELP ariadne_jenkins_workspace_cleanup_runs_total" "$METRICS_FILE"
grep -F "# HELP ariadne_jenkins_workspace_cleanup_objects_total" "$METRICS_FILE"
echo "[5/5] show recent cleanup signal"
if grep -q "ariadne_jenkins_workspace_cleanup_runs_total" "$METRICS_FILE"; then
grep "ariadne_jenkins_workspace_cleanup_runs_total" "$METRICS_FILE" | grep "mode=\"${PROM_MODE}\"" || true
else
echo "No run counter sample yet for mode=${PROM_MODE}; wait for schedule window and re-run." >&2
fi
echo "Recent cleanup logs (if any):"
kubectl -n "$NAMESPACE" logs "deployment/$DEPLOYMENT" --tail=500 | grep -i "jenkins workspace cleanup" | tail -n 20 || true
echo "verification complete for mode=${MODE}"

View File

@ -5,7 +5,7 @@ metadata:
name: ollama
namespace: ai
spec:
replicas: 0
replicas: 1
revisionHistoryLimit: 2
strategy:
type: RollingUpdate
@ -21,7 +21,7 @@ spec:
app: ollama
annotations:
ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
ai.bstein.dev/gpu: GPU pool (titan-20/21)
ai.bstein.dev/gpu: GPU pool (titan-22/24)
ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
spec:
affinity:
@ -32,13 +32,13 @@ spec:
- key: kubernetes.io/hostname
operator: In
values:
- titan-20
- titan-21
- titan-22
- titan-24
runtimeClassName: nvidia
volumes:
- name: models
persistentVolumeClaim:
claimName: ollama-models-asteria
claimName: ollama-models
initContainers:
- name: warm-model
image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d

View File

@ -2,12 +2,12 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ollama-models-asteria
name: ollama-models
namespace: ai
spec:
accessModes:
- ReadWriteMany
- ReadWriteOnce
resources:
requests:
storage: 30Gi
storageClassName: asteria
storageClassName: astreae

View File

@ -49,15 +49,6 @@ spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13", "titan-15", "titan-17", "titan-19"]
imagePullSecrets:
- name: harbor-regcred
containers:

View File

@ -38,36 +38,6 @@ spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
containers:
- name: gateway
image: python:3.11-slim

View File

@ -26,7 +26,7 @@ spec:
imagePullPolicy: Always
ports:
- name: http
containerPort: 8080
containerPort: 80
readinessProbe:
httpGet:
path: /

View File

@ -10,4 +10,4 @@ spec:
ports:
- name: http
port: 80
targetPort: 8080
targetPort: 80

View File

@ -20,9 +20,9 @@ resources:
- ingress.yaml
images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-267 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
newTag: 0.1.1-263 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-267 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
newTag: 0.1.1-263 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator:
- name: chat-ai-gateway
namespace: bstein-dev-home

View File

@ -16,7 +16,7 @@ spec:
labels:
app: atlasbot
annotations:
checksum/atlasbot-configmap: manual-atlasbot-103
checksum/atlasbot-configmap: manual-atlasbot-101
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@ -28,15 +28,6 @@ spec:
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-genius-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-genius-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-genius-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-seeder-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -85,41 +76,23 @@ spec:
- name: ARIADNE_STATE_URL
value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
- name: BOT_USER
value: atlas-smart
- name: BOT_USER_QUICK
value: atlas-quick
- name: BOT_USER_SMART
value: atlas-smart
- name: BOT_USER_GENIUS
value: atlas-genius
value: atlasbot
- name: BOT_MENTIONS
value: atlas-quick,atlas-smart,atlas-genius,atlas_quick,atlas_smart,atlas_genius
value: atlasbot,aatlasbot,atlas_quick,atlas_smart
- name: OLLAMA_URL
value: http://ollama.ai.svc.cluster.local:11434
- name: OLLAMA_MODEL
value: qwen2.5:14b-instruct-q4_0
value: qwen2.5:14b-instruct
- name: ATLASBOT_MODEL_FAST
value: qwen2.5-coder:7b-instruct-q4_0
- name: ATLASBOT_MODEL_SMART
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_GENIUS
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_DEEP
value: qwen2.5:14b-instruct-q4_0
value: qwen2.5:14b-instruct
- name: OLLAMA_FALLBACK_MODEL
value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_TIMEOUT_SEC
value: "600"
- name: ATLASBOT_QUICK_TIME_BUDGET_SEC
value: "15"
- name: ATLASBOT_SMART_TIME_BUDGET_SEC
value: "45"
- name: ATLASBOT_GENIUS_TIME_BUDGET_SEC
value: "180"
- name: ATLASBOT_OLLAMA_RETRIES
value: "0"
- name: ATLASBOT_THINKING_INTERVAL_SEC
value: "30"
value: "120"
- name: ATLASBOT_SNAPSHOT_TTL_SEC
value: "30"
- name: ATLASBOT_HTTP_PORT

View File

@ -17,7 +17,6 @@ spec:
spec:
nodeSelector:
hardware: rpi5
node-role.kubernetes.io/worker: "true"
containers:
- name: element-call
image: ghcr.io/element-hq/element-call@sha256:e6897c7818331714eae19d83ef8ea94a8b41115f0d8d3f62c2fed2d02c65c9bc

View File

@ -119,7 +119,6 @@ spec:
> /synapse/config/conf.d/runtime-secrets.yaml
nodeSelector:
hardware: rpi5
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
@ -418,7 +417,6 @@ spec:
nodeSelector:
hardware: rpi5
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:

View File

@ -1,19 +1,5 @@
# Metis (node recovery)
## Fast path (SD/media failure)
1. Run `scripts/node_recover.sh <node> --yes --delete-node` from `titan-iac`.
2. Reimage/reprovision the replacement host.
3. Rejoin the replacement node to k3s.
4. Reapply labels and taints from generated artifacts:
- `artifacts/node-recovery/<node>-<timestamp>/restore-labels.sh`
- `artifacts/node-recovery/<node>-<timestamp>/restore-taints.sh`
5. Verify workloads, then uncordon the replacement node.
### Notes
- `node_recover.sh` snapshots node labels/taints and current pod placement before drain.
- Use `--skip-drain` for a dead/unreachable node where only artifact capture is possible.
- Use `--delete-node` after drain (or for hard-dead nodes) so replacement join is clean.
## Node classes (current map)
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)

View File

@ -1,12 +1,12 @@
# services/comms/oneoffs/synapse-user-seed-job.yaml
# One-off job for comms/synapse-user-seed-9.
# Purpose: synapse user seed 9 (see container args/env in this file).
# One-off job for comms/synapse-user-seed-8.
# Purpose: synapse user seed 8 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: synapse-user-seed-9
name: synapse-user-seed-8
namespace: comms
spec:
suspend: true

View File

@ -11,39 +11,27 @@ from urllib import error, parse, request
BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
BOT_USER = os.environ["BOT_USER"]
BOT_PASS = os.environ["BOT_PASS"]
BOT_USER_QUICK = os.environ.get("BOT_USER_QUICK", "").strip()
BOT_PASS_QUICK = os.environ.get("BOT_PASS_QUICK", "").strip()
BOT_USER_SMART = os.environ.get("BOT_USER_SMART", "").strip()
BOT_PASS_SMART = os.environ.get("BOT_PASS_SMART", "").strip()
BOT_USER_GENIUS = os.environ.get("BOT_USER_GENIUS", "").strip()
BOT_PASS_GENIUS = os.environ.get("BOT_PASS_GENIUS", "").strip()
USER = BOT_USER
PASSWORD = BOT_PASS
USER = os.environ["BOT_USER"]
PASSWORD = os.environ["BOT_PASS"]
ROOM_ALIAS = "#othrys:live.bstein.dev"
OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct")
MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "")
MODEL_SMART = os.environ.get("ATLASBOT_MODEL_SMART", os.environ.get("ATLASBOT_MODEL_DEEP", "")).strip()
MODEL_GENIUS = os.environ.get("ATLASBOT_MODEL_GENIUS", MODEL_SMART).strip()
MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "")
FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "")
API_KEY = os.environ.get("CHAT_API_KEY", "")
OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090"))
ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "")
SNAPSHOT_TTL_SEC = int(os.environ.get("ATLASBOT_SNAPSHOT_TTL_SEC", "30"))
LOGIN_RETRY_CAP_SEC = int(os.environ.get("ATLASBOT_LOGIN_RETRY_CAP_SEC", "60"))
# 0 means retry forever (default); useful during startup when MAS/Synapse ordering is still converging.
LOGIN_MAX_ATTEMPTS = int(os.environ.get("ATLASBOT_LOGIN_MAX_ATTEMPTS", "0"))
KB_DIR = os.environ.get("KB_DIR", "")
VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{BOT_USER},atlas")
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
@ -51,9 +39,6 @@ MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000"))
THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
QUICK_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_QUICK_TIME_BUDGET_SEC", "15"))
SMART_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_SMART_TIME_BUDGET_SEC", "45"))
GENIUS_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_GENIUS_TIME_BUDGET_SEC", "180"))
OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"
@ -395,103 +380,27 @@ def _strip_bot_mention(text: str) -> str:
return cleaned or text.strip()
def _detect_mode_from_body(body: str, *, default: str = "smart") -> str:
def _detect_mode_from_body(body: str, *, default: str = "deep") -> str:
lower = normalize_query(body or "")
if "atlas_quick" in lower or "atlas-quick" in lower:
return "fast"
if "atlas_smart" in lower or "atlas-smart" in lower:
return "smart"
if "atlas_genius" in lower or "atlas-genius" in lower:
return "genius"
return "deep"
if lower.startswith("quick ") or lower.startswith("fast "):
return "fast"
if lower.startswith("smart "):
return "smart"
if lower.startswith("genius ") or lower.startswith("deep "):
return "genius"
if lower.startswith("smart ") or lower.startswith("deep "):
return "deep"
return default
def _detect_mode(
content: dict[str, Any],
body: str,
*,
default: str = "smart",
account_user: str = "",
) -> str:
mode = _detect_mode_from_body(body, default=default)
mentions = content.get("m.mentions", {})
user_ids = mentions.get("user_ids", [])
if isinstance(user_ids, list):
normalized = {normalize_user_id(uid).lower() for uid in user_ids if isinstance(uid, str)}
if BOT_USER_QUICK and normalize_user_id(BOT_USER_QUICK).lower() in normalized:
return "fast"
if BOT_USER_SMART and normalize_user_id(BOT_USER_SMART).lower() in normalized:
return "smart"
if BOT_USER_GENIUS and normalize_user_id(BOT_USER_GENIUS).lower() in normalized:
return "genius"
if BOT_USER and normalize_user_id(BOT_USER).lower() in normalized:
return "smart"
if account_user and BOT_USER_QUICK and normalize_user_id(account_user) == normalize_user_id(BOT_USER_QUICK):
return "fast"
if account_user and BOT_USER_SMART and normalize_user_id(account_user) == normalize_user_id(BOT_USER_SMART):
return "smart"
if account_user and BOT_USER_GENIUS and normalize_user_id(account_user) == normalize_user_id(BOT_USER_GENIUS):
return "genius"
return mode
def _model_for_mode(mode: str) -> str:
if mode == "fast" and MODEL_FAST:
return MODEL_FAST
if mode == "smart" and MODEL_SMART:
return MODEL_SMART
if mode == "genius" and MODEL_GENIUS:
return MODEL_GENIUS
if mode == "deep" and MODEL_SMART:
return MODEL_SMART
if mode == "deep" and MODEL_DEEP:
return MODEL_DEEP
return MODEL
def _normalize_mode(mode: str) -> str:
normalized = (mode or "").strip().lower()
if normalized in {"quick", "fast"}:
return "fast"
if normalized in {"smart"}:
return "smart"
if normalized in {"genius", "deep"}:
return "genius"
return "smart"
def _mode_time_budget_sec(mode: str) -> float:
normalized = _normalize_mode(mode)
if normalized == "fast":
return max(1.0, QUICK_TIME_BUDGET_SEC)
if normalized == "smart":
return max(1.0, SMART_TIME_BUDGET_SEC)
if normalized == "genius":
return max(1.0, GENIUS_TIME_BUDGET_SEC)
return max(1.0, SMART_TIME_BUDGET_SEC)
def _mode_ollama_timeout_sec(mode: str) -> float:
normalized = _normalize_mode(mode)
budget = _mode_time_budget_sec(normalized)
if normalized == "fast":
return max(6.0, min(budget - 2.0, OLLAMA_TIMEOUT_SEC))
if normalized == "smart":
return max(12.0, min(budget - 5.0, OLLAMA_TIMEOUT_SEC))
if normalized == "genius":
return max(20.0, min(budget - 10.0, OLLAMA_TIMEOUT_SEC))
return max(12.0, min(budget - 5.0, OLLAMA_TIMEOUT_SEC))
def _mode_heartbeat_sec(mode: str) -> int:
normalized = _normalize_mode(mode)
budget = _mode_time_budget_sec(normalized)
return max(5, min(THINKING_INTERVAL_SEC, int(max(5.0, budget / 3.0))))
# Matrix HTTP helper.
def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
url = (base or BASE) + path
@ -507,12 +416,12 @@ def req(method: str, path: str, token: str | None = None, body=None, timeout=60,
raw = resp.read()
return json.loads(raw.decode()) if raw else {}
def login(user: str, password: str) -> str:
login_user = normalize_user_id(user)
def login() -> str:
login_user = normalize_user_id(USER)
payload = {
"type": "m.login.password",
"identifier": {"type": "m.id.user", "user": login_user},
"password": password,
"password": PASSWORD,
}
res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
return res["access_token"]
@ -2128,9 +2037,6 @@ def _doc_intent(query: str) -> bool:
"triage",
"recover",
"remediate",
"longhorn",
"astreae",
"asteria",
)
)
@ -2722,11 +2628,6 @@ def _append_history_context(context: str, history_lines: list[str]) -> str:
return combined
def _merge_context_blocks(*blocks: str) -> str:
parts = [block.strip() for block in blocks if isinstance(block, str) and block.strip()]
return "\n\n".join(parts)
class ThoughtState:
def __init__(self, total_steps: int = 0):
self._lock = threading.Lock()
@ -3084,7 +2985,6 @@ def _ollama_call_safe(
fallback: str,
system_override: str | None = None,
model: str | None = None,
timeout: float | None = None,
) -> str:
try:
return _ollama_call(
@ -3094,7 +2994,6 @@ def _ollama_call_safe(
use_history=False,
system_override=system_override,
model=model,
timeout=timeout,
)
except Exception:
return fallback
@ -3914,12 +3813,9 @@ def _open_ended_multi(
def _open_ended_total_steps(mode: str) -> int:
normalized = _normalize_mode(mode)
if normalized == "fast":
if mode == "fast":
return 2
if normalized == "smart":
return 3
return 4
return 9
def _fast_fact_lines(
@ -4067,7 +3963,6 @@ def _fallback_fact_answer(prompt: str, context: str) -> str:
hottest_intent = any(word in q for word in ("hottest", "highest", "most", "top", "busiest"))
metric = _detect_metric(q)
include_hw, _exclude_hw = _detect_hardware_filters(q)
wants_longhorn = any(word in tokens for word in ("longhorn", "astreae", "asteria"))
if hottest_intent and metric in {"cpu", "ram", "net", "io"}:
hottest_val = _find_value(f"hottest_{metric}")
@ -4106,18 +4001,6 @@ def _fallback_fact_answer(prompt: str, context: str) -> str:
return f"Not ready nodes: {match.group(1)}."
if count_intent and include_hw:
if wants_longhorn:
for hw in include_hw:
for fact, key, val in parsed_facts:
key_tokens = set(_tokens(key or fact))
if "longhorn" not in key_tokens:
continue
if hw not in key_tokens:
continue
nodes = _extract_titan_nodes(val or fact)
if nodes:
return f"{hw} longhorn nodes: {len(nodes)}."
return ""
counts_line = _find_value("nodes_by_hardware_count")
if counts_line:
counts = _parse_counts(counts_line)
@ -4141,59 +4024,11 @@ def _fallback_fact_answer(prompt: str, context: str) -> str:
cp_nodes = _find_value("control_plane_nodes")
if cp_nodes:
return f"Control-plane nodes: {cp_nodes}."
if wants_longhorn:
for hw in include_hw:
best_nodes: list[str] = []
best_val = ""
for _fact, key, val in parsed_facts:
if not key or not val:
continue
key_tokens = set(_tokens(key))
if "longhorn" not in key_tokens:
continue
if hw not in key_tokens:
continue
nodes = _extract_titan_nodes(val)
if nodes and len(nodes) > len(best_nodes):
best_nodes = nodes
best_val = val
elif not best_nodes and val:
best_val = val
if best_nodes:
return f"{hw} longhorn nodes: {', '.join(best_nodes)}."
if best_val:
return f"{hw} longhorn nodes: {best_val}."
for hw in include_hw:
if wants_longhorn:
continue
hw_line = _find_value(hw)
if hw_line:
return f"{hw} nodes: {hw_line}."
if list_intent and "longhorn" in tokens:
best_nodes: list[str] = []
best_key = ""
best_val = ""
for _fact, key, val in parsed_facts:
if not key or not val:
continue
key_tokens = set(_tokens(key))
if "longhorn" not in key_tokens:
continue
nodes = _extract_titan_nodes(val)
if nodes and len(nodes) > len(best_nodes):
best_nodes = nodes
best_key = key
best_val = val
elif not best_nodes and val:
best_key = key
best_val = val
if best_nodes:
return f"Longhorn nodes: {', '.join(best_nodes)}."
if best_val:
label = (best_key or "Longhorn nodes").replace("_", " ").strip()
return f"{label.capitalize()}: {best_val}."
if list_intent and "control" in q:
cp_nodes = _find_value("control_plane_nodes")
if cp_nodes:
@ -4224,10 +4059,6 @@ def _fallback_fact_answer(prompt: str, context: str) -> str:
best_fact = ""
best_score = -1
for fact in facts:
if wants_longhorn:
fact_tokens = set(_tokens(fact))
if not ({"longhorn", "astreae", "asteria"} & fact_tokens):
continue
key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact)
if not key_match:
key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact)
@ -4305,7 +4136,6 @@ def _open_ended_fast_single(
prompt: str,
*,
context: str,
fallback_context: str | None = None,
history_lines: list[str] | None = None,
state: ThoughtState | None = None,
model: str,
@ -4313,26 +4143,24 @@ def _open_ended_fast_single(
if state:
state.update("drafting", step=1, note="summarizing")
working_context = _append_history_context(context, history_lines or []) if history_lines else context
reply = _ollama_call_safe(
reply = _ollama_call(
("atlasbot_fast", "atlasbot_fast"),
prompt,
context=working_context,
fallback="",
use_history=False,
system_override=_open_ended_system(),
model=model,
timeout=_mode_ollama_timeout_sec("fast"),
)
if not _has_body_lines(reply):
reply = _ollama_call_safe(
reply = _ollama_call(
("atlasbot_fast", "atlasbot_fast"),
prompt + " Provide one clear sentence before the score lines.",
context=working_context,
fallback="",
use_history=False,
system_override=_open_ended_system(),
model=model,
timeout=_mode_ollama_timeout_sec("fast"),
)
fallback = _fallback_fact_answer(prompt, fallback_context or context)
fallback = _fallback_fact_answer(prompt, context)
if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
reply = fallback
if not _has_body_lines(reply):
@ -4349,7 +4177,6 @@ def _open_ended_fast(
fact_lines: list[str],
fact_meta: dict[str, dict[str, Any]],
history_lines: list[str],
extra_context: str = "",
state: ThoughtState | None = None,
) -> str:
model = _model_for_mode("fast")
@ -4370,7 +4197,6 @@ def _open_ended_fast(
selected_pack = _fact_pack_text(selected_lines, selected_meta)
if _needs_full_fact_pack(prompt) or not selected_lines:
selected_pack = fact_pack
model_context = _merge_context_blocks(selected_pack, extra_context)
if not subjective and _needs_full_fact_pack(prompt):
fallback = _fallback_fact_answer(prompt, fact_pack)
if fallback:
@ -4379,8 +4205,7 @@ def _open_ended_fast(
state.total_steps = _open_ended_total_steps("fast")
return _open_ended_fast_single(
prompt,
context=model_context,
fallback_context=selected_pack,
context=selected_pack,
history_lines=history_lines,
state=state,
model=model,
@ -4394,55 +4219,16 @@ def _open_ended_deep(
fact_lines: list[str],
fact_meta: dict[str, dict[str, Any]],
history_lines: list[str],
mode: str,
extra_context: str = "",
state: ThoughtState | None = None,
) -> str:
normalized = _normalize_mode(mode)
model = _model_for_mode(normalized)
subjective = _is_subjective_query(prompt)
primary_tags = _primary_tags_for_prompt(prompt)
focus_tags = _preferred_tags_for_prompt(prompt)
if not focus_tags and subjective:
focus_tags = set(_ALLOWED_INSIGHT_TAGS)
avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set()
limit = 12 if normalized == "smart" else 18
selected_lines = _fast_fact_lines(
fact_lines,
fact_meta,
focus_tags=focus_tags,
avoid_tags=avoid_tags,
primary_tags=primary_tags,
limit=limit,
)
selected_meta = _fact_pack_meta(selected_lines)
selected_pack = _fact_pack_text(selected_lines, selected_meta)
if _needs_full_fact_pack(prompt) or not selected_lines or normalized == "genius":
selected_pack = fact_pack
fallback = _fallback_fact_answer(prompt, selected_pack)
model_context = _merge_context_blocks(selected_pack, extra_context)
if not subjective and fallback:
if state:
state.update("done", step=_open_ended_total_steps(normalized))
return _ensure_scores(fallback)
if state:
state.update("drafting", step=1, note="synthesizing")
reply = _ollama_call_safe(
("atlasbot_deep", "atlasbot_deep"),
return _open_ended_multi(
prompt,
context=_append_history_context(model_context, history_lines),
fallback="",
system_override=_open_ended_system(),
model=model,
timeout=_mode_ollama_timeout_sec(normalized),
fact_pack=fact_pack,
fact_lines=fact_lines,
fact_meta=fact_meta,
history_lines=history_lines,
state=state,
)
if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
reply = fallback
if not _has_body_lines(reply):
reply = "I don't have enough data in the current snapshot to answer that."
if state:
state.update("done", step=_open_ended_total_steps(normalized))
return _ensure_scores(reply)
def open_ended_answer(
@ -4454,7 +4240,6 @@ def open_ended_answer(
history_lines: list[str],
mode: str,
allow_tools: bool,
context: str = "",
state: ThoughtState | None = None,
) -> str:
lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
@ -4471,15 +4256,13 @@ def open_ended_answer(
return _ensure_scores("I don't have enough data to answer that.")
fact_meta = _fact_pack_meta(lines)
fact_pack = _fact_pack_text(lines, fact_meta)
normalized = _normalize_mode(mode)
if normalized == "fast":
if mode == "fast":
return _open_ended_fast(
prompt,
fact_pack=fact_pack,
fact_lines=lines,
fact_meta=fact_meta,
history_lines=history_lines,
extra_context=context,
state=state,
)
return _open_ended_deep(
@ -4488,8 +4271,6 @@ def open_ended_answer(
fact_lines=lines,
fact_meta=fact_meta,
history_lines=history_lines,
extra_context=context,
mode=normalized,
state=state,
)
@ -4511,7 +4292,6 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s
use_history=False,
system_override=system,
model=model,
timeout=_mode_ollama_timeout_sec(mode),
)
reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip()
return _ensure_scores(reply)
@ -4563,7 +4343,13 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
self._write_json(400, {"error": "missing_prompt"})
return
cleaned = _strip_bot_mention(prompt)
mode = _normalize_mode(str(payload.get("mode") or "smart"))
mode = str(payload.get("mode") or "deep").lower()
if mode in ("quick", "fast"):
mode = "fast"
elif mode in ("smart", "deep"):
mode = "deep"
else:
mode = "deep"
snapshot = _snapshot_state()
inventory = _snapshot_inventory(snapshot) or node_inventory_live()
workloads = _snapshot_workloads(snapshot)
@ -4600,7 +4386,6 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
history_lines=history_lines,
mode=mode,
allow_tools=True,
context=context,
state=None,
)
else:
@ -4855,7 +4640,6 @@ def _ollama_call(
use_history: bool = True,
system_override: str | None = None,
model: str | None = None,
timeout: float | None = None,
) -> str:
system = system_override or (
"System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
@ -4889,7 +4673,6 @@ def _ollama_call(
messages.append({"role": "user", "content": prompt})
model_name = model or MODEL
request_timeout = timeout if timeout is not None else OLLAMA_TIMEOUT_SEC
payload = {"model": model_name, "messages": messages, "stream": False}
headers = {"Content-Type": "application/json"}
if API_KEY:
@ -4900,13 +4683,13 @@ def _ollama_call(
lock.acquire()
try:
try:
with request.urlopen(r, timeout=request_timeout) as resp:
with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
data = json.loads(resp.read().decode())
except error.HTTPError as exc:
if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]:
payload["model"] = FALLBACK_MODEL
r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers)
with request.urlopen(r, timeout=request_timeout) as resp:
with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
data = json.loads(resp.read().decode())
else:
raise
@ -4931,7 +4714,6 @@ def ollama_reply(
fallback: str = "",
use_history: bool = True,
model: str | None = None,
timeout: float | None = None,
) -> str:
last_error = None
for attempt in range(max(1, OLLAMA_RETRIES + 1)):
@ -4942,7 +4724,6 @@ def ollama_reply(
context=context,
use_history=use_history,
model=model,
timeout=timeout,
)
except Exception as exc: # noqa: BLE001
last_error = exc
@ -4963,24 +4744,20 @@ def ollama_reply_with_thinking(
fallback: str,
use_history: bool = True,
model: str | None = None,
timeout: float | None = None,
) -> str:
result: dict[str, str] = {"reply": ""}
done = threading.Event()
def worker():
try:
result["reply"] = ollama_reply(
hist_key,
prompt,
context=context,
fallback=fallback,
use_history=use_history,
model=model,
timeout=timeout,
)
finally:
done.set()
result["reply"] = ollama_reply(
hist_key,
prompt,
context=context,
fallback=fallback,
use_history=use_history,
model=model,
)
done.set()
thread = threading.Thread(target=worker, daemon=True)
thread.start()
@ -5012,7 +4789,6 @@ def open_ended_with_thinking(
history_lines: list[str],
mode: str,
allow_tools: bool,
context: str = "",
) -> str:
result: dict[str, str] = {"reply": ""}
done = threading.Event()
@ -5020,26 +4796,23 @@ def open_ended_with_thinking(
state = ThoughtState(total_steps=total_steps)
def worker():
try:
result["reply"] = open_ended_answer(
prompt,
inventory=inventory,
snapshot=snapshot,
workloads=workloads,
history_lines=history_lines,
mode=mode,
allow_tools=allow_tools,
context=context,
state=state,
)
finally:
done.set()
result["reply"] = open_ended_answer(
prompt,
inventory=inventory,
snapshot=snapshot,
workloads=workloads,
history_lines=history_lines,
mode=mode,
allow_tools=allow_tools,
state=state,
)
done.set()
thread = threading.Thread(target=worker, daemon=True)
thread.start()
if not done.wait(2.0):
send_msg(token, room, "Thinking…")
heartbeat = _mode_heartbeat_sec(mode)
heartbeat = max(10, THINKING_INTERVAL_SEC)
next_heartbeat = time.monotonic() + heartbeat
while not done.wait(max(0, next_heartbeat - time.monotonic())):
send_msg(token, room, state.status_line())
@ -5047,7 +4820,7 @@ def open_ended_with_thinking(
thread.join(timeout=1)
return result["reply"] or "Model backend is busy. Try again in a moment."
def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str):
def sync_loop(token: str, room_id: str):
since = None
try:
res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
@ -5088,7 +4861,7 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
if not body:
continue
sender = ev.get("sender", "")
if account_user and sender == normalize_user_id(account_user):
if sender == f"@{USER}:live.bstein.dev":
continue
mentioned = is_mentioned(content, body)
@ -5101,12 +4874,7 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
cleaned_body = _strip_bot_mention(body)
lower_body = cleaned_body.lower()
mode = _detect_mode(
content,
body,
default=_normalize_mode(default_mode),
account_user=account_user,
)
mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep")
# Only do live cluster introspection in DMs.
allow_tools = is_dm
@ -5170,88 +4938,39 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
snapshot=snapshot,
workloads=workloads,
history_lines=history[hist_key],
mode=_normalize_mode(mode),
mode=mode if mode in ("fast", "deep") else "deep",
allow_tools=allow_tools,
context=context,
)
else:
reply = _non_cluster_reply(
cleaned_body,
history_lines=history[hist_key],
mode=_normalize_mode(mode),
mode=mode if mode in ("fast", "deep") else "deep",
)
send_msg(token, rid, reply)
history[hist_key].append(f"Atlas: {reply}")
history[hist_key] = history[hist_key][-80:]
def login_with_retry(user: str, password: str):
attempts = 0
while True:
def login_with_retry():
last_err = None
for attempt in range(10):
try:
return login(user, password)
return login()
except Exception as exc: # noqa: BLE001
attempts += 1
if LOGIN_MAX_ATTEMPTS > 0 and attempts >= LOGIN_MAX_ATTEMPTS:
raise
delay = min(LOGIN_RETRY_CAP_SEC, 2 ** min(attempts, 8))
print(
f"atlasbot login retry for {normalize_user_id(user)} "
f"(attempt={attempts}, delay={delay}s): {exc}",
flush=True,
)
time.sleep(delay)
def _bot_accounts() -> list[dict[str, str]]:
accounts: list[dict[str, str]] = []
def add(user: str, password: str, mode: str):
if not user or not password:
return
accounts.append({"user": user, "password": password, "mode": mode})
add(BOT_USER_SMART or BOT_USER, BOT_PASS_SMART or BOT_PASS, "smart")
if BOT_USER_QUICK and BOT_PASS_QUICK:
add(BOT_USER_QUICK, BOT_PASS_QUICK, "fast")
if BOT_USER_GENIUS and BOT_PASS_GENIUS:
add(BOT_USER_GENIUS, BOT_PASS_GENIUS, "genius")
if BOT_USER and BOT_PASS and all(acc["user"] != BOT_USER for acc in accounts):
add(BOT_USER, BOT_PASS, "smart")
seen: set[str] = set()
unique: list[dict[str, str]] = []
for acc in accounts:
uid = normalize_user_id(acc["user"]).lower()
if uid in seen:
continue
seen.add(uid)
unique.append(acc)
return unique
last_err = exc
time.sleep(min(30, 2 ** attempt))
raise last_err
def main():
load_kb()
_start_http_server()
accounts = _bot_accounts()
threads: list[threading.Thread] = []
for acc in accounts:
token = login_with_retry(acc["user"], acc["password"])
try:
room_id = resolve_alias(token, ROOM_ALIAS)
join_room(token, room_id)
except Exception:
room_id = None
thread = threading.Thread(
target=sync_loop,
args=(token, room_id),
kwargs={
"account_user": acc["user"],
"default_mode": acc["mode"],
},
daemon=True,
)
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
token = login_with_retry()
try:
room_id = resolve_alias(token, ROOM_ALIAS)
join_room(token, room_id)
except Exception:
room_id = None
sync_loop(token, room_id)
if __name__ == "__main__":
main()

View File

@ -7,14 +7,6 @@ read_secret() {
tr -d '\r\n' < "${vault_dir}/$1"
}
read_optional() {
if [ -f "${vault_dir}/$1" ]; then
tr -d '\r\n' < "${vault_dir}/$1"
else
printf ''
fi
}
export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
@ -22,12 +14,6 @@ export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_QUICK}" ]; then BOT_PASS_QUICK="${BOT_PASS}"; fi
if [ -z "${BOT_PASS_SMART}" ]; then BOT_PASS_SMART="${BOT_PASS}"; fi
if [ -z "${BOT_PASS_GENIUS}" ]; then BOT_PASS_GENIUS="${BOT_PASS_SMART}"; fi
export SEEDER_PASS="$(read_secret seeder-pass)"
export CHAT_API_KEY="$(read_secret chat-matrix)"

View File

@ -1,227 +0,0 @@
from __future__ import annotations
import importlib.util
import os
from pathlib import Path
from unittest import TestCase, mock
BOT_PATH = Path(__file__).resolve().parents[1] / "atlasbot" / "bot.py"
def load_bot_module():
env = {
"BOT_USER": "atlas-smart",
"BOT_PASS": "smart-pass",
"BOT_USER_QUICK": "atlas-quick",
"BOT_PASS_QUICK": "quick-pass",
"BOT_USER_SMART": "atlas-smart",
"BOT_PASS_SMART": "smart-pass",
"BOT_USER_GENIUS": "atlas-genius",
"BOT_PASS_GENIUS": "genius-pass",
"OLLAMA_URL": "http://ollama.invalid",
"OLLAMA_MODEL": "base-model",
"ATLASBOT_MODEL_FAST": "fast-model",
"ATLASBOT_MODEL_SMART": "smart-model",
"ATLASBOT_MODEL_GENIUS": "genius-model",
"ATLASBOT_QUICK_TIME_BUDGET_SEC": "15",
"ATLASBOT_SMART_TIME_BUDGET_SEC": "45",
"ATLASBOT_GENIUS_TIME_BUDGET_SEC": "180",
"KB_DIR": "",
"VM_URL": "http://vm.invalid",
"ARIADNE_STATE_URL": "",
"ARIADNE_STATE_TOKEN": "",
}
with mock.patch.dict(os.environ, env, clear=False):
spec = importlib.util.spec_from_file_location("atlasbot_bot", BOT_PATH)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
class AtlasbotModeTests(TestCase):
def setUp(self):
self.bot = load_bot_module()
def test_bot_accounts_include_genius_mode(self):
accounts = self.bot._bot_accounts()
by_user = {account["user"]: account["mode"] for account in accounts}
self.assertEqual(by_user["atlas-quick"], "fast")
self.assertEqual(by_user["atlas-smart"], "smart")
self.assertEqual(by_user["atlas-genius"], "genius")
def test_objective_cluster_question_uses_fact_pack_without_llm(self):
fact_lines = [
"hottest_cpu: longhorn-system (6.69)",
"hottest_ram: longhorn-system (36.05 GB)",
]
with (
mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
mock.patch.object(self.bot, "_ollama_call", side_effect=AssertionError("LLM should not be called")),
):
reply = self.bot.open_ended_answer(
"what is the hottest cpu node in titan lab currently?",
inventory=[],
snapshot=None,
workloads=[],
history_lines=[],
mode="smart",
allow_tools=True,
)
self.assertIn("longhorn-system", reply)
self.assertIn("Confidence:", reply)
def test_subjective_genius_answer_uses_genius_model(self):
fact_lines = [
"hottest_cpu: longhorn-system (6.69)",
"worker_nodes: titan-01, titan-02, titan-03",
]
captured: dict[str, object] = {}
def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
captured["model"] = model
captured["timeout"] = timeout
captured["context"] = context
return "The worker spread stands out because Titan keeps meaningful capacity on the same cluster. Confidence: high"
with (
mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
mock.patch.object(self.bot, "_ollama_call", side_effect=fake_ollama_call),
):
reply = self.bot.open_ended_answer(
"what stands out about titan lab?",
inventory=[],
snapshot=None,
workloads=[],
history_lines=[],
mode="genius",
allow_tools=True,
context='Cluster snapshot (JSON): {"injected":true}',
)
self.assertIn("The worker spread stands out", reply)
self.assertEqual(captured["model"], "genius-model")
self.assertLessEqual(float(captured["timeout"]), 180.0)
self.assertIn('Cluster snapshot (JSON): {"injected":true}', str(captured["context"]))
def test_mode_timeouts_stay_within_budgets(self):
fact_lines = [
"hottest_cpu: longhorn-system (6.69)",
"worker_nodes: titan-01, titan-02, titan-03",
]
seen: list[tuple[str, float]] = []
def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
seen.append((str(model), float(timeout or 0)))
return "Atlas has a clear standout because the worker spread is healthy. Confidence: high"
with (
mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
mock.patch.object(self.bot, "_ollama_call", side_effect=fake_ollama_call),
):
for mode in ("fast", "smart", "genius"):
reply = self.bot.open_ended_answer(
"what stands out about titan lab?",
inventory=[],
snapshot=None,
workloads=[],
history_lines=[],
mode=mode,
allow_tools=True,
)
self.assertIn("Confidence:", reply)
self.assertEqual([model for model, _ in seen], ["fast-model", "smart-model", "genius-model"])
self.assertLessEqual(seen[0][1], 15.0)
self.assertLessEqual(seen[1][1], 45.0)
self.assertLessEqual(seen[2][1], 180.0)
def test_llm_timeout_still_returns_a_conclusion(self):
fact_lines = [
"worker_nodes: titan-01, titan-02, titan-03",
"hottest_cpu: longhorn-system (6.69)",
]
with (
mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
mock.patch.object(self.bot, "_ollama_call", side_effect=TimeoutError("simulated timeout")),
):
reply = self.bot.open_ended_answer(
"what stands out about the worker nodes?",
inventory=[],
snapshot=None,
workloads=[],
history_lines=[],
mode="genius",
allow_tools=True,
)
self.assertIn("worker nodes", reply.lower())
self.assertIn("Confidence:", reply)
def test_longhorn_rpi4_subset_beats_generic_rpi4_list(self):
fact_lines = [
"rpi4: titan-12, titan-13, titan-14, titan-15, titan-17, titan-18, titan-19",
"rpi4 armbian longhorn: titan-13/15/17/19",
]
with (
mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
mock.patch.object(self.bot, "_ollama_call", side_effect=AssertionError("LLM should not be called")),
):
reply = self.bot.open_ended_answer(
"which nodes in titan are the rpi4 longhorn nodes?",
inventory=[],
snapshot=None,
workloads=[],
history_lines=[],
mode="smart",
allow_tools=True,
)
self.assertIn("titan-13", reply)
self.assertIn("titan-15", reply)
self.assertIn("titan-17", reply)
self.assertIn("titan-19", reply)
self.assertNotIn("titan-12", reply)
self.assertNotIn("titan-14", reply)
self.assertNotIn("titan-18", reply)
self.assertIn("Confidence:", reply)
def test_longhorn_query_uses_kb_context_when_snapshot_only_has_generic_rpi4(self):
fact_lines = [
"rpi4: titan-12, titan-13, titan-14, titan-15, titan-17, titan-18, titan-19",
]
kb_detail = (
"Atlas KB (retrieved):\n"
"- metis (knowledge/metis.md)\n"
"rpi4 armbian longhorn: titan-13/15/17/19"
)
with (
mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
mock.patch.object(self.bot, "kb_retrieve", return_value=kb_detail),
mock.patch.object(self.bot, "_ollama_call", side_effect=AssertionError("LLM should not be called")),
):
reply = self.bot.open_ended_answer(
"which nodes in titan are the rpi4 longhorn nodes?",
inventory=[],
snapshot=None,
workloads=[],
history_lines=[],
mode="smart",
allow_tools=True,
)
self.assertIn("titan-13", reply)
self.assertIn("titan-15", reply)
self.assertIn("titan-17", reply)
self.assertIn("titan-19", reply)
self.assertNotIn("titan-12", reply)
self.assertNotIn("titan-14", reply)
self.assertNotIn("titan-18", reply)
self.assertIn("Confidence:", reply)

View File

@ -29,18 +29,12 @@ spec:
operator: In
values: ["rpi4","rpi5"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 80
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 60
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
values: ["rpi4"]
containers:
- name: monerod
image: registry.bstein.dev/crypto/monerod:0.18.4.1
@ -75,14 +69,14 @@ spec:
httpGet: { path: /get_info, port: 18081 }
initialDelaySeconds: 120
periodSeconds: 10
timeoutSeconds: 20
failureThreshold: 18
timeoutSeconds: 3
failureThreshold: 6
livenessProbe:
httpGet: { path: /get_info, port: 18081 }
initialDelaySeconds: 300
periodSeconds: 20
timeoutSeconds: 20
failureThreshold: 36
timeoutSeconds: 3
failureThreshold: 6
terminationGracePeriodSeconds: 120
lifecycle:
preStop:

View File

@ -122,20 +122,14 @@ spec:
- matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
values: ["rpi4","rpi5"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: In
values: ["titan-20","titan-21"]
values: ["rpi4"]
containers:
- name: gitea
image: gitea/gitea:1.23

View File

@ -53,7 +53,7 @@ spec:
registry:
existingClaim: harbor-registry
accessMode: ReadWriteOnce
size: 100Gi
size: 50Gi
jobservice:
jobLog:
existingClaim: harbor-jobservice-logs
@ -75,12 +75,11 @@ spec:
redis:
type: internal
internal:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-redis
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -112,12 +111,11 @@ spec:
existingSecretAdminPasswordKey: harbor_admin_password
existingSecretSecretKey: harbor-core
core:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-core
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
serviceAccountName: harbor-vault-sync
automountServiceAccountToken: true
existingSecret: harbor-core
@ -127,10 +125,6 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "harbor"
vault.hashicorp.com/agent-requests-cpu: "25m"
vault.hashicorp.com/agent-limits-cpu: "100m"
vault.hashicorp.com/agent-requests-mem: "32Mi"
vault.hashicorp.com/agent-limits-mem: "128Mi"
vault.hashicorp.com/agent-inject-secret-harbor-core-env.sh: "kv/data/atlas/harbor/harbor-core"
vault.hashicorp.com/agent-inject-template-harbor-core-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -178,22 +172,17 @@ spec:
operator: In
values: ["rpi4"]
jobservice:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-jobservice
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
serviceAccountName: harbor-vault-sync
automountServiceAccountToken: true
existingSecret: harbor-jobservice
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "harbor"
vault.hashicorp.com/agent-requests-cpu: "25m"
vault.hashicorp.com/agent-limits-cpu: "100m"
vault.hashicorp.com/agent-requests-mem: "32Mi"
vault.hashicorp.com/agent-limits-mem: "128Mi"
vault.hashicorp.com/agent-inject-secret-harbor-jobservice-env.sh: "kv/data/atlas/harbor/harbor-jobservice"
vault.hashicorp.com/agent-inject-template-harbor-jobservice-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -225,12 +214,11 @@ spec:
operator: In
values: ["rpi4"]
portal:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-portal
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -253,24 +241,10 @@ spec:
operator: In
values: ["rpi4"]
registry:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
registry:
image:
repository: registry.bstein.dev/infra/harbor-registry
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
extraEnvVars:
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
controller:
image:
repository: registry.bstein.dev/infra/harbor-registryctl
@ -283,20 +257,12 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "harbor"
vault.hashicorp.com/agent-requests-cpu: "25m"
vault.hashicorp.com/agent-limits-cpu: "100m"
vault.hashicorp.com/agent-requests-mem: "32Mi"
vault.hashicorp.com/agent-limits-mem: "128Mi"
vault.hashicorp.com/agent-inject-secret-harbor-registry-env.sh: "kv/data/atlas/harbor/harbor-registry"
vault.hashicorp.com/agent-inject-template-harbor-registry-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-registry" }}
export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}"
export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}"
{{ end }}
{{ with secret "kv/data/atlas/harbor/harbor-jobservice" }}
export JOBSERVICE_SECRET="{{ .Data.data.JOBSERVICE_SECRET }}"
export REGISTRY_NOTIFICATIONS_ENDPOINTS_0_HEADERS_Authorization="Harbor-Secret ${JOBSERVICE_SECRET}"
{{ end }}
vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry"
vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -314,6 +280,8 @@ spec:
{{- with secret "kv/data/atlas/harbor/harbor-registry-htpasswd" -}}
{{ .Data.data.REGISTRY_HTPASSWD }}
{{- end }}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -336,12 +304,11 @@ spec:
operator: In
values: ["rpi4"]
nginx:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-nginx
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
nodeSelector:
kubernetes.io/hostname: titan-05
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -430,10 +397,10 @@ spec:
patch: |-
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/name
value: harbor-core
value: harbor-registry
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/port/number
value: 80
value: 5000
- target:
kind: Deployment
name: harbor-jobservice
@ -497,17 +464,8 @@ spec:
value: /vault/secrets/harbor-registry-env.sh
- name: VAULT_COPY_FILES
value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
envFrom: []
envFrom:
- $patch: replace
volumeMounts:
- $patch: replace
- name: harbor-vault-entrypoint

View File

@ -8,7 +8,7 @@ spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 100Gi
storage: 50Gi
storageClassName: astreae
---
apiVersion: v1

View File

@ -77,26 +77,23 @@ spec:
mountPath: /config
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: longhorn-host
operator: In
values:
- "true"
- key: node-role.kubernetes.io/worker
operator: In
values:
- "true"
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: hardware
- key: kubernetes.io/hostname
operator: In
values:
- rpi5
- titan-22
- weight: 80
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- titan-20
- titan-21
- weight: 60
preference:
matchExpressions:
- key: kubernetes.io/hostname
@ -108,6 +105,7 @@ spec:
fsGroup: 65532
fsGroupChangePolicy: OnRootMismatch
runAsGroup: 65532
runtimeClassName: nvidia
containers:
- name: jellyfin
image: docker.io/jellyfin/jellyfin:10.11.5
@ -120,6 +118,8 @@ spec:
- name: http
containerPort: 8096
env:
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,video,utility"
- name: JELLYFIN_PublishedServerUrl
value: "https://stream.bstein.dev"
- name: PUID
@ -131,7 +131,12 @@ spec:
- name: VAULT_COPY_FILES
value: /vault/secrets/ldap-config.xml:/config/plugins/configurations/LDAP-Auth.xml
resources:
limits:
nvidia.com/gpu.shared: 1
# cpu: "4"
# memory: 8Gi
requests:
nvidia.com/gpu.shared: 1
cpu: "500m"
memory: 1Gi
volumeMounts:

568
services/jellyfin/oidc/Jenkinsfile vendored Normal file
View File

@ -0,0 +1,568 @@
pipeline {
agent {
kubernetes {
yaml """
apiVersion: v1
kind: Pod
spec:
restartPolicy: Never
containers:
- name: dotnet
image: mcr.microsoft.com/dotnet/sdk:9.0
command:
- cat
tty: true
"""
}
}
options {
timestamps()
}
parameters {
string(name: 'HARBOR_REPO', defaultValue: 'registry.bstein.dev/streaming/oidc-plugin', description: 'OCI repository for the plugin artifact')
string(name: 'JELLYFIN_VERSION', defaultValue: '10.11.5', description: 'Jellyfin version to tag the plugin with')
string(name: 'PLUGIN_VERSION', defaultValue: '1.0.2.0', description: 'Plugin version')
}
environment {
ORAS_VERSION = "1.2.0"
DOTNET_CLI_TELEMETRY_OPTOUT = "1"
DOTNET_SKIP_FIRST_TIME_EXPERIENCE = "1"
}
stages {
stage('Checkout') {
steps {
container('dotnet') {
checkout scm
}
}
}
stage('Build plugin') {
steps {
container('dotnet') {
sh '''
set -euo pipefail
apt-get update
apt-get install -y --no-install-recommends zip curl ca-certificates git
WORKDIR="$(pwd)/build"
SRC_DIR="${WORKDIR}/src"
DIST_DIR="${WORKDIR}/dist"
ART_DIR="${WORKDIR}/artifact"
rm -rf "${SRC_DIR}" "${DIST_DIR}" "${ART_DIR}"
mkdir -p "${SRC_DIR}" "${DIST_DIR}" "${ART_DIR}"
git clone https://github.com/lolerskatez/JellyfinOIDCPlugin.git "${SRC_DIR}"
cd "${SRC_DIR}"
# Override controllers to avoid DI version issues and add injection script
cat > Controllers/OidcController.cs <<'EOF'
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using IdentityModel.OidcClient;
using MediaBrowser.Controller.Library;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.DependencyInjection;
namespace JellyfinOIDCPlugin.Controllers;
#nullable enable
[ApiController]
[Route("api/oidc")]
public class OidcController : ControllerBase
{
private IUserManager UserManager => HttpContext.RequestServices.GetRequiredService<IUserManager>();
private static readonly Dictionary<string, object> StateManager = new(); // Store AuthorizeState objects
[HttpGet("start")]
public async Task<IActionResult> Start()
{
var config = Plugin.Instance?.Configuration;
if (config == null)
{
return BadRequest("Plugin not initialized");
}
var options = new OidcClientOptions
{
Authority = config.OidEndpoint?.Trim(),
ClientId = config.OidClientId?.Trim(),
ClientSecret = config.OidSecret?.Trim(),
RedirectUri = GetRedirectUri(),
Scope = string.Join(" ", config.OidScopes)
};
try
{
var client = new OidcClient(options);
var result = await client.PrepareLoginAsync().ConfigureAwait(false);
// Store the authorize state for the callback
var stateString = (string)result.GetType().GetProperty("State")?.GetValue(result);
if (!string.IsNullOrEmpty(stateString))
{
StateManager[stateString] = result;
}
var startUrl = (string)result.GetType().GetProperty("StartUrl")?.GetValue(result);
if (string.IsNullOrEmpty(startUrl))
{
Console.WriteLine("OIDC: Could not get StartUrl from OIDC result");
return BadRequest("OIDC initialization failed");
}
return Redirect(startUrl);
}
catch (Exception ex)
{
Console.WriteLine($"OIDC start error: {ex}");
return BadRequest("OIDC error: " + ex.Message);
}
}
[HttpGet("callback")]
public async Task<IActionResult> Callback()
{
var config = Plugin.Instance?.Configuration;
if (config == null)
{
return BadRequest("Plugin not initialized");
}
try
{
var stateParam = Request.Query["state"].ToString();
if (string.IsNullOrEmpty(stateParam) || !StateManager.TryGetValue(stateParam, out var storedState))
{
Console.WriteLine($"OIDC: Invalid state {stateParam}");
return BadRequest("Invalid state");
}
var options = new OidcClientOptions
{
Authority = config.OidEndpoint?.Trim(),
ClientId = config.OidClientId?.Trim(),
ClientSecret = config.OidSecret?.Trim(),
RedirectUri = GetRedirectUri(),
Scope = string.Join(" ", config.OidScopes)
};
var client = new OidcClient(options);
// Cast stored state to AuthorizeState - it's stored as object
var authorizeState = (AuthorizeState)storedState;
var result = await client.ProcessResponseAsync(Request.QueryString.Value, authorizeState).ConfigureAwait(false);
if (result.IsError)
{
Console.WriteLine($"OIDC callback failed: {result.Error} - {result.ErrorDescription}");
return BadRequest("OIDC authentication failed");
}
// Get email from claims
var email = result.User?.FindFirst("email")?.Value ??
result.User?.FindFirst("preferred_username")?.Value ??
result.User?.FindFirst("sub")?.Value;
if (string.IsNullOrEmpty(email))
{
Console.WriteLine("OIDC: No email/username found in OIDC response");
return BadRequest("No email/username found in OIDC response");
}
// Get or create user
var user = UserManager.GetUserByName(email);
if (user == null)
{
Console.WriteLine($"OIDC: Creating new user {email}");
user = await UserManager.CreateUserAsync(email).ConfigureAwait(false);
}
// Set authentication provider
user.AuthenticationProviderId = "OIDC";
// Get roles from claims
var rolesClaimValue = result.User?.FindFirst(config.RoleClaim)?.Value;
var roles = string.IsNullOrEmpty(rolesClaimValue)
? Array.Empty<string>()
: rolesClaimValue.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
// Set permissions based on groups
var isAdmin = roles.Any(r => r.Equals("admin", StringComparison.OrdinalIgnoreCase));
var isPowerUser = roles.Any(r => r.Equals("Power User", StringComparison.OrdinalIgnoreCase)) && !isAdmin;
Console.WriteLine($"OIDC: User {email} authenticated. Admin: {isAdmin}, PowerUser: {isPowerUser}");
// Update user in database
await UserManager.UpdateUserAsync(user).ConfigureAwait(false);
StateManager.Remove(stateParam);
// Redirect to Jellyfin main page
return Redirect("/");
}
catch (Exception ex)
{
Console.WriteLine($"OIDC callback error: {ex}");
return BadRequest("OIDC error: " + ex.Message);
}
}
[HttpPost("token")]
public async Task<IActionResult> ExchangeToken([FromBody] TokenExchangeRequest request)
{
var config = Plugin.Instance?.Configuration;
if (config == null)
{
Console.WriteLine("OIDC: Plugin not initialized");
return BadRequest("Plugin not initialized");
}
if (string.IsNullOrEmpty(request?.AccessToken))
{
Console.WriteLine("OIDC: No access token provided");
return BadRequest("Access token is required");
}
try
{
Console.WriteLine("OIDC: Processing token exchange request");
// Validate the token with the OIDC provider using UserInfo endpoint
var options = new OidcClientOptions
{
Authority = config.OidEndpoint?.Trim(),
ClientId = config.OidClientId?.Trim(),
ClientSecret = config.OidSecret?.Trim(),
Scope = string.Join(" ", config.OidScopes)
};
var client = new OidcClient(options);
// Use the access token to get user info
var userInfoResult = await client.GetUserInfoAsync(request.AccessToken).ConfigureAwait(false);
if (userInfoResult.IsError)
{
Console.WriteLine($"OIDC: Failed to get user info: {userInfoResult.Error}");
return Unauthorized("Invalid access token");
}
// Extract email/username from user info
var email = userInfoResult.Claims.FirstOrDefault(c => c.Type == "email")?.Value ??
userInfoResult.Claims.FirstOrDefault(c => c.Type == "preferred_username")?.Value ??
userInfoResult.Claims.FirstOrDefault(c => c.Type == "sub")?.Value;
if (string.IsNullOrEmpty(email))
{
Console.WriteLine("OIDC: No email/username found in token");
return BadRequest("No email/username found in token");
}
// Get or create user
var user = UserManager.GetUserByName(email);
if (user == null)
{
if (!config.AutoCreateUser)
{
Console.WriteLine($"OIDC: User {email} not found and auto-create disabled");
return Unauthorized("User does not exist and auto-creation is disabled");
}
Console.WriteLine($"OIDC: Creating new user from token {email}");
user = await UserManager.CreateUserAsync(email).ConfigureAwait(false);
}
// Update user authentication provider
user.AuthenticationProviderId = "OIDC";
// Get roles from claims
var rolesClaimName = config.RoleClaim ?? "groups";
var rolesClaimValue = userInfoResult.Claims.FirstOrDefault(c => c.Type == rolesClaimName)?.Value;
var roles = string.IsNullOrEmpty(rolesClaimValue)
? Array.Empty<string>()
: rolesClaimValue.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
// Set permissions based on groups
var isAdmin = roles.Any(r => r.Equals("admin", StringComparison.OrdinalIgnoreCase));
var isPowerUser = roles.Any(r => r.Equals("Power User", StringComparison.OrdinalIgnoreCase)) && !isAdmin;
Console.WriteLine($"OIDC: Token exchange for {email} Admin:{isAdmin} Power:{isPowerUser}");
// Update user in database
await UserManager.UpdateUserAsync(user).ConfigureAwait(false);
// Return success with user info
return Ok(new TokenExchangeResponse
{
Success = true,
UserId = user.Id.ToString(),
Username = user.Username,
Email = email,
IsAdmin = isAdmin,
Message = "User authenticated successfully"
});
}
catch (Exception ex)
{
Console.WriteLine($"OIDC token exchange error: {ex}");
return StatusCode(500, $"Token exchange failed: {ex.Message}");
}
}
/// <summary>
/// Resolves the OIDC redirect URI: an explicitly configured value wins;
/// otherwise one is derived from the current request's scheme and host.
/// </summary>
private string GetRedirectUri()
{
    var fromConfig = Plugin.Instance?.Configuration?.RedirectUri;
    return string.IsNullOrWhiteSpace(fromConfig)
        ? $"{Request.Scheme}://{Request.Host}/api/oidc/callback"
        : fromConfig!;
}
}
/// <summary>Request body for POST api/oidc/token.</summary>
public class TokenExchangeRequest
{
    // Bearer access token to validate against the provider's UserInfo endpoint.
    public string? AccessToken { get; set; }
    // Accepted but not read by the visible controller code.
    public string? IdToken { get; set; }
}
/// <summary>Response body returned by a successful token exchange.</summary>
public class TokenExchangeResponse
{
    public bool Success { get; set; }
    // Jellyfin user id as a string (user.Id.ToString()).
    public string? UserId { get; set; }
    public string? Username { get; set; }
    public string? Email { get; set; }
    // Derived from the role claim; informational only (see ExchangeToken).
    public bool IsAdmin { get; set; }
    public string? Message { get; set; }
}
EOF
cat > Controllers/OidcStaticController.cs <<'EOF'
using System;
using System.IO;
using System.Reflection;
using MediaBrowser.Common.Plugins;
using Microsoft.AspNetCore.Mvc;
namespace JellyfinOIDCPlugin.Controllers;
[ApiController]
[Route("api/oidc")]
public class OidcStaticController : ControllerBase
{
[HttpGet("login.js")]
public IActionResult GetLoginScript()
{
try
{
var assembly = Assembly.GetExecutingAssembly();
using var stream = assembly.GetManifestResourceStream("JellyfinOIDCPlugin.web.oidc-login.js");
if (stream == null)
{
Console.WriteLine("OIDC: Login script resource not found");
return NotFound();
}
using var reader = new StreamReader(stream);
var content = reader.ReadToEnd();
return Content(content, "application/javascript");
}
catch (Exception ex)
{
Console.WriteLine($"OIDC: Error serving login script {ex}");
return StatusCode(500, "Error loading login script");
}
}
[HttpGet("loader.js")]
public IActionResult GetLoader()
{
try
{
var assembly = Assembly.GetExecutingAssembly();
using var stream = assembly.GetManifestResourceStream("JellyfinOIDCPlugin.web.oidc-loader.js");
if (stream == null)
{
Console.WriteLine("OIDC: Loader script resource not found");
return NotFound();
}
using var reader = new StreamReader(stream);
var content = reader.ReadToEnd();
return Content(content, "application/javascript");
}
catch (Exception ex)
{
Console.WriteLine($"OIDC: Error serving loader script {ex}");
return StatusCode(500, "Error loading loader script");
}
}
[HttpGet("inject")]
public IActionResult GetInject()
{
try
{
var script = @"
(function() {
console.log('[OIDC Plugin] Bootstrap inject started');
// Load oidc-loader.js dynamically
const loaderScript = document.createElement('script');
loaderScript.src = '/api/oidc/loader.js';
loaderScript.type = 'application/javascript';
loaderScript.onerror = function() {
console.error('[OIDC Plugin] Failed to load loader.js');
};
loaderScript.onload = function() {
console.log('[OIDC Plugin] Loader.js loaded successfully');
};
// Append to head or body
const target = document.head || document.documentElement;
target.appendChild(loaderScript);
console.log('[OIDC Plugin] Bootstrap script appended to page');
})();
";
return Content(script, "application/javascript");
}
catch (Exception ex)
{
Console.WriteLine($"OIDC: Error serving inject script {ex}");
return StatusCode(500, "Error loading inject script");
}
}
[HttpGet("global.js")]
public IActionResult GetGlobalInjector()
{
try
{
var assembly = Assembly.GetExecutingAssembly();
using var stream = assembly.GetManifestResourceStream("JellyfinOIDCPlugin.web.oidc-global-injector.js");
if (stream == null)
{
Console.WriteLine("OIDC: Global injector resource not found");
return NotFound();
}
using var reader = new StreamReader(stream);
var content = reader.ReadToEnd();
return Content(content, "application/javascript");
}
catch (Exception ex)
{
Console.WriteLine($"OIDC: Error serving global injector {ex}");
return StatusCode(500, "Error loading global injector");
}
}
[HttpGet("config")]
public IActionResult GetConfigurationPage()
{
try
{
var assembly = Assembly.GetExecutingAssembly();
using var stream = assembly.GetManifestResourceStream("JellyfinOIDCPlugin.web.configurationpage.html");
if (stream == null)
{
Console.WriteLine("OIDC: Configuration page resource not found");
return NotFound("Configuration page resource not found");
}
using var reader = new StreamReader(stream);
var content = reader.ReadToEnd();
return Content(content, "text/html");
}
catch (Exception ex)
{
Console.WriteLine($"OIDC: Error serving configuration page {ex}");
return StatusCode(500, $"Error loading configuration page: {ex.Message}");
}
}
}
EOF
cat > JellyfinOIDCPlugin.csproj <<'EOF'
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net9.0</TargetFramework>
<AssemblyName>JellyfinOIDCPlugin.v2</AssemblyName>
<RootNamespace>JellyfinOIDCPlugin</RootNamespace>
<LangVersion>latest</LangVersion>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<AssemblyVersion>1.0.2.0</AssemblyVersion>
<FileVersion>1.0.2.0</FileVersion>
<CopyLocalLockFileAssemblies>false</CopyLocalLockFileAssemblies>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Jellyfin.Controller" Version="10.11.5">
<ExcludeAssets>runtime</ExcludeAssets>
</PackageReference>
<PackageReference Include="Jellyfin.Model" Version="10.11.5">
<ExcludeAssets>runtime</ExcludeAssets>
</PackageReference>
<PackageReference Include="Jellyfin.Common" Version="10.11.5">
<ExcludeAssets>runtime</ExcludeAssets>
</PackageReference>
<PackageReference Include="Jellyfin.Data" Version="10.11.5">
<ExcludeAssets>runtime</ExcludeAssets>
</PackageReference>
<PackageReference Include="Jellyfin.Database.Implementations" Version="10.11.5">
<ExcludeAssets>runtime</ExcludeAssets>
</PackageReference>
<PackageReference Include="IdentityModel.OidcClient" Version="5.2.1">
<PrivateAssets>none</PrivateAssets>
</PackageReference>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="9.0.11">
<ExcludeAssets>runtime</ExcludeAssets>
</PackageReference>
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="web\\*.html" />
<EmbeddedResource Include="web\\*.js" />
<EmbeddedResource Include="web\\*.css" />
</ItemGroup>
</Project>
EOF
dotnet restore
dotnet publish -c Release --no-self-contained -o "${DIST_DIR}"
cd "${DIST_DIR}"
zip -r "${ART_DIR}/OIDC_Authentication_${PLUGIN_VERSION}-net9.zip" .
'''
}
}
}
stage('Push to Harbor') {
steps {
container('dotnet') {
withCredentials([usernamePassword(credentialsId: 'harbor-robot', usernameVariable: 'HARBOR_USERNAME', passwordVariable: 'HARBOR_PASSWORD')]) {
sh '''
set -euo pipefail
WORKDIR="$(pwd)/build"
ORAS_BIN="/usr/local/bin/oras"
curl -sSL "https://github.com/oras-project/oras/releases/download/v${ORAS_VERSION}/oras_${ORAS_VERSION}_linux_amd64.tar.gz" | tar -xz -C /usr/local/bin oras
ref_host="$(echo "${HARBOR_REPO}" | cut -d/ -f1)"
"${ORAS_BIN}" login "${ref_host}" -u "${HARBOR_USERNAME}" -p "${HARBOR_PASSWORD}"
artifact="${WORKDIR}/artifact/OIDC_Authentication_${PLUGIN_VERSION}-net9.zip"
"${ORAS_BIN}" push "${HARBOR_REPO}:${JELLYFIN_VERSION}" "${artifact}:application/zip" --artifact-type application/zip
"${ORAS_BIN}" push "${HARBOR_REPO}:latest" "${artifact}:application/zip" --artifact-type application/zip
'''
}
}
}
}
}
post {
always {
container('dotnet') {
archiveArtifacts artifacts: 'build/artifact/*.zip', allowEmptyArchive: true
}
}
}
}

View File

@ -45,17 +45,6 @@ data:
username: "${HARBOR_ROBOT_USERNAME}"
password: "${HARBOR_ROBOT_PASSWORD}"
description: "Harbor robot for pipelines"
- usernamePassword:
scope: GLOBAL
id: harbor-robot-streaming
username: "${HARBOR_STREAMING_ROBOT_USERNAME}"
password: "${HARBOR_STREAMING_ROBOT_PASSWORD}"
description: "Harbor robot for streaming pushes"
- string:
scope: GLOBAL
id: sonarqube-token
secret: "${SONARQUBE_TOKEN}"
description: "SonarQube token for quality-gate evidence collection"
jobs.yaml: |
jobs:
- script: |
@ -84,6 +73,48 @@ data:
}
}
}
pipelineJob('jellyfin-oidc-plugin') {
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/titan-iac.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('services/jellyfin/oidc/Jenkinsfile')
}
}
}
pipelineJob('ci-demo') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/1 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/ci-demo.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('bstein-dev-home') {
properties {
pipelineTriggers {
@ -136,188 +167,6 @@ data:
}
}
}
pipelineJob('metis') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/metis.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('ananke') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/ananke.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('lesavka') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/lesavka.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('arcanagon') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/arcanagon.git')
credentials('gitea-pat')
}
branches('*/master')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('pegasus') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/pegasus.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('atlasbot') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/atlasbot.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('soteria') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/soteria.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('data-prepper') {
properties {
pipelineTriggers {
@ -337,65 +186,13 @@ data:
url('https://scm.bstein.dev/bstein/titan-iac.git')
credentials('gitea-pat')
}
branches('*/main')
branches('*/feature/sso-hardening')
}
}
scriptPath('services/logging/Jenkinsfile.data-prepper')
}
}
}
pipelineJob('titan-iac') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/titan-iac.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('typhon') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/typhon.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
multibranchPipelineJob('titan-iac-quality-gate') {
branchSources {
branchSource {
@ -488,40 +285,6 @@ data:
podRetention: Never
serviceAccount: "jenkins"
slaveConnectTimeoutStr: "100"
yaml: |
spec:
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 85
preference:
matchExpressions:
- key: hardware
operator: In
values:
- rpi5
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
jenkins/jenkins-jenkins-agent: "true"
yamlMergeStrategy: override
inheritYamlMergeStrategy: false
slaveAgentPort: 50000

View File

@ -10,7 +10,7 @@ data:
workflow-aggregator:608.v67378e9d3db_1
git:5.8.1
pipeline-utility-steps:2.20.0
configuration-as-code:2036.v0b_c2de701dcb_
configuration-as-code:2031.veb_a_fdda_b_3ffd
oic-auth:4.609.v9de140f63d01
job-dsl:1.93
simple-theme-plugin:230.v8b_fd91b_b_800c

View File

@ -33,35 +33,22 @@ spec:
{{ with secret "kv/data/atlas/jenkins/harbor-robot-creds" }}
HARBOR_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_ROBOT_PASSWORD={{ .Data.data.password }}
HARBOR_STREAMING_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_STREAMING_ROBOT_PASSWORD={{ .Data.data.password }}
{{ end }}
{{ with secret "kv/data/atlas/jenkins/harbor-streaming-robot-creds" }}
HARBOR_STREAMING_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_STREAMING_ROBOT_PASSWORD={{ .Data.data.password }}
{{ end }}
{{ with secret "kv/data/atlas/shared/harbor-pull" }}
{{- if and .Data.data.username .Data.data.password }}
HARBOR_PULL_USERNAME={{ .Data.data.username }}
HARBOR_PULL_PASSWORD={{ .Data.data.password }}
HARBOR_ROBOT_USERNAME={{ .Data.data.username }}
HARBOR_ROBOT_PASSWORD={{ .Data.data.password }}
{{- end }}
{{ end }}
{{ with secret "kv/data/atlas/jenkins/gitea-pat" }}
GITEA_PAT_USERNAME={{ .Data.data.username }}
GITEA_PAT_TOKEN={{ .Data.data.token }}
{{ end }}
{{ with secret "kv/data/atlas/quality/sonarqube-oidc" }}
SONARQUBE_TOKEN={{ .Data.data.sonarqube_exporter_token }}
{{ end }}
{{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
{{ end }}
{{ with secret "kv/data/atlas/jenkins/ariadne-api" }}
ARIADNE_JENKINS_API_USER={{ .Data.data.username }}
ARIADNE_JENKINS_API_TOKEN={{ .Data.data.token }}
{{ end }}
bstein.dev/restarted-at: "2026-04-13T06:35:00Z"
bstein.dev/restarted-at: "2026-01-20T14:52:41Z"
spec:
serviceAccountName: jenkins
nodeSelector:
@ -70,21 +57,6 @@ spec:
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: atlas.bstein.dev/spillover
operator: DoesNotExist
- weight: 95
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
- weight: 90
preference:
matchExpressions:
@ -103,7 +75,6 @@ spec:
- sso.bstein.dev
securityContext:
fsGroup: 1000
fsGroupChangePolicy: OnRootMismatch
initContainers:
- name: install-plugins
image: jenkins/jenkins:2.528.3-jdk21
@ -113,7 +84,6 @@ spec:
- -c
- |
set -euo pipefail
rm -rf /usr/share/jenkins/ref/plugins/*
jenkins-plugin-cli --plugin-file /plugins/plugins.txt
volumeMounts:
- name: plugins
@ -180,8 +150,7 @@ spec:
port: http
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 60
failureThreshold: 20
volumeMounts:
- name: jenkins-home
mountPath: /var/jenkins_home

View File

@ -1,13 +0,0 @@
# services/jenkins/dind-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jenkins-dind-cache
namespace: jenkins
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 30Gi
storageClassName: local-path

View File

@ -8,7 +8,6 @@ resources:
- vault-serviceaccount.yaml
- pvc.yaml
- cache-pvc.yaml
- dind-pvc.yaml
- plugins-pvc.yaml
- configmap-jcasc.yaml
- configmap-plugins.yaml
@ -22,7 +21,6 @@ configMapGenerator:
- name: jenkins-init-scripts
namespace: jenkins
files:
- ariadne-api-user.groovy=scripts/ariadne-api-user.groovy
- git-notify-token.groovy=scripts/git-notify-token.groovy
- theme.groovy=scripts/theme.groovy
options:

View File

@ -1,96 +0,0 @@
// Jenkins init script: ensure a service user exists with a fixed API token.
// Token sources, in order of preference: the ARIADNE_JENKINS_API_TOKEN env
// var, then a previously persisted token file under /var/jenkins_home/secrets.
// The winning token is written back to that file (owner-only permissions) so
// it survives restarts without the env var.
import hudson.model.User
import jenkins.security.ApiTokenProperty
def userId = (System.getenv("ARIADNE_JENKINS_API_USER") ?: "").trim()
def envTokenValue = (System.getenv("ARIADNE_JENKINS_API_TOKEN") ?: "").trim()
def tokenName = "ariadne-weather"
def tokenFile = new File("/var/jenkins_home/secrets/ariadne-api-token")
def userFile = new File("/var/jenkins_home/secrets/ariadne-api-user")
def persistedTokenValue = tokenFile.exists() ? (tokenFile.text ?: "").trim() : ""
// Env var wins over the persisted file when both are present.
def tokenValue = envTokenValue ?: persistedTokenValue
if (!userId || !tokenValue) {
    println("Ariadne API user bootstrap skipped: missing ARIADNE_JENKINS_API_USER and no token source available")
    return
}
// Second argument `true` creates the user record if it does not exist yet.
def user = User.getById(userId, true)
if (user == null) {
    println("Ariadne API user bootstrap failed: unable to resolve user ${userId}")
    return
}
// Give the account a human-readable name unless one was already set.
if (!user.getFullName() || user.getFullName().trim() == userId) {
    user.setFullName("Ariadne Metrics")
}
def prop = user.getProperty(ApiTokenProperty.class)
if (prop == null) {
    prop = new ApiTokenProperty()
    user.addProperty(prop)
}
// If the persisted token is still valid for this user, keep it.
if (persistedTokenValue && prop.matchesPassword(persistedTokenValue)) {
    tokenValue = persistedTokenValue
}
// Only touch the token store when the desired token is not already active.
if (!prop.matchesPassword(tokenValue)) {
    def store = prop.getTokenStore()
    boolean configured = false
    try {
        // Revoke any stale token with the same name before re-adding it.
        def existing = store.getTokenListSortedByName().find { token ->
            try {
                token.getName() == tokenName
            } catch (Throwable ignored) {
                false
            }
        }
        if (existing != null) {
            try {
                store.revokeToken(existing.getUuid())
            } catch (Throwable ignored) {
                // Older store variants expose `uuid` as a field instead.
                try {
                    store.revokeToken(existing.uuid)
                } catch (Throwable ignoredAgain) {
                    println("Ariadne API user bootstrap warning: failed to revoke existing token ${tokenName}")
                }
            }
        }
        // addFixedNewToken installs a caller-chosen token value.
        store.addFixedNewToken(tokenName, tokenValue)
        configured = true
    } catch (Throwable ignored) {
        // Fallback for older token-store variants.
    }
    if (!configured) {
        if (persistedTokenValue && prop.matchesPassword(persistedTokenValue)) {
            tokenValue = persistedTokenValue
        } else {
            // Last resort: let Jenkins mint a token and persist its plain value.
            def generated = store.generateNewToken(tokenName)
            if (generated?.plainValue) {
                tokenValue = generated.plainValue
            }
            println("Ariadne API user bootstrap warning: addFixedNewToken unavailable, generated replacement token")
        }
    }
}
// Persist token and user id with owner-only read/write permissions.
tokenFile.parentFile?.mkdirs()
tokenFile.text = tokenValue + "\n"
tokenFile.setReadable(false, false)
tokenFile.setReadable(true, true)
tokenFile.setWritable(false, false)
tokenFile.setWritable(true, true)
userFile.parentFile?.mkdirs()
userFile.text = userId + "\n"
userFile.setReadable(false, false)
userFile.setReadable(true, true)
userFile.setWritable(false, false)
userFile.setWritable(true, true)
user.save()
println("Ariadne API user bootstrap complete for ${userId}")

View File

@ -35,38 +35,7 @@ subjects:
- kind: ServiceAccount
name: jenkins
namespace: jenkins
- kind: ServiceAccount
name: default
namespace: jenkins
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: jenkins-agent
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: jenkins-glue-observer
rules:
- apiGroups: ["batch"]
resources:
- cronjobs
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: jenkins-glue-observer
subjects:
- kind: ServiceAccount
name: jenkins
namespace: jenkins
- kind: ServiceAccount
name: default
namespace: jenkins
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: jenkins-glue-observer

View File

@ -18,15 +18,6 @@ spec:
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13", "titan-15", "titan-17", "titan-19"]
containers:
- name: sync
image: alpine:3.20

View File

@ -22,11 +22,6 @@ resources:
- oneoffs/mas-secrets-ensure-job.yaml
- oneoffs/synapse-oidc-secret-ensure-job.yaml
- oneoffs/logs-oidc-secret-ensure-job.yaml
- oneoffs/metis-oidc-secret-ensure-job.yaml
- oneoffs/soteria-oidc-secret-ensure-job.yaml
- oneoffs/quality-oidc-secret-ensure-job.yaml
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
- oneoffs/metis-node-passwords-secret-ensure-job.yaml
- oneoffs/harbor-oidc-secret-ensure-job.yaml
- oneoffs/vault-oidc-secret-ensure-job.yaml
- oneoffs/actual-oidc-secret-ensure-job.yaml

View File

@ -1,110 +0,0 @@
# services/keycloak/oneoffs/metis-node-passwords-secret-ensure-job.yaml
# One-off job for sso/metis-node-passwords-secret-ensure-4.
# Purpose: ensure per-node Metis recovery placeholders exist in Vault.
# Atlas/root values are preserved while intranet IPs are standardized per node.
apiVersion: batch/v1
kind: Job
metadata:
name: metis-node-passwords-secret-ensure-4
namespace: sso
spec:
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
spec:
serviceAccountName: mas-secrets-ensure
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: apply
image: registry.bstein.dev/bstein/kubectl:1.35.0
command: ["/bin/sh", "-c"]
args:
- |
set -eu
vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
vault_role="${VAULT_ROLE:-sso-secrets}"
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
vault_token="$(curl -sS --request POST --data "${login_payload}" "${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
echo "vault login failed" >&2
exit 1
fi
ensured=0
while read -r node intranet_ip; do
if [ -z "${node}" ] || [ -z "${intranet_ip}" ]; then
continue
fi
secret_path="kv/data/atlas/nodes/${node}"
read_status="$(curl -sS -o /tmp/node-read.json -w "%{http_code}" -H "X-Vault-Token: ${vault_token}" "${vault_addr}/v1/${secret_path}" || true)"
if [ "${read_status}" = "200" ]; then
atlas_password="$(jq -r '.data.data.atlas_password // empty' /tmp/node-read.json)"
root_password="$(jq -r '.data.data.root_password // empty' /tmp/node-read.json)"
elif [ "${read_status}" = "404" ]; then
atlas_password=""
root_password=""
else
echo "Vault read failed for ${node} (status ${read_status})" >&2
cat /tmp/node-read.json >&2 || true
exit 1
fi
payload="$(jq -nc --arg atlas_password "${atlas_password}" --arg root_password "${root_password}" --arg intranet_ip "${intranet_ip}" '{data:{atlas_password:$atlas_password,root_password:$root_password,intranet_ip:$intranet_ip}}')"
write_status="$(curl -sS -o /tmp/node-write.json -w "%{http_code}" -X POST -H "X-Vault-Token: ${vault_token}" -H 'Content-Type: application/json' -d "${payload}" "${vault_addr}/v1/${secret_path}")"
if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
echo "Vault write failed for ${node} (status ${write_status})" >&2
cat /tmp/node-write.json >&2 || true
exit 1
fi
ensured=$((ensured + 1))
echo "Ensured node secret placeholder for ${node} (${intranet_ip})"
done <<'EOF_NODES'
titan-jh 192.168.22.8
titan-db 192.168.22.10
titan-0a 192.168.22.11
titan-0b 192.168.22.12
titan-0c 192.168.22.13
titan-20 192.168.22.20
titan-21 192.168.22.21
titan-22 192.168.22.22
titan-23 192.168.22.23
titan-24 192.168.22.26
titan-04 192.168.22.30
titan-05 192.168.22.31
titan-06 192.168.22.32
titan-07 192.168.22.33
titan-08 192.168.22.34
titan-09 192.168.22.35
titan-10 192.168.22.36
titan-11 192.168.22.37
titan-12 192.168.22.40
titan-13 192.168.22.41
titan-14 192.168.22.42
titan-15 192.168.22.43
titan-16 192.168.22.44
titan-17 192.168.22.45
titan-18 192.168.22.46
titan-19 192.168.22.47
EOF_NODES
echo "Ensured ${ensured} Metis node placeholders in Vault"

View File

@ -1,198 +0,0 @@
# services/keycloak/oneoffs/metis-oidc-secret-ensure-job.yaml
# One-off job for sso/metis-oidc-secret-ensure-3.
# Purpose: ensure the Metis oauth2-proxy OIDC client and Vault secret exist.
# Keep this completed Job around; bump the suffix if it ever needs to be rerun.
apiVersion: batch/v1
kind: Job
metadata:
name: metis-oidc-secret-ensure-3
namespace: sso
spec:
# No retries: a failed run is rerun manually by bumping the name suffix.
backoffLimit: 0
template:
metadata:
annotations:
# Vault agent renders Keycloak admin credentials into /vault/secrets
# before the main container starts; pre-populate-only avoids a sidecar.
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/agent-pre-populate-only: "true"
vault.hashicorp.com/role: "sso-secrets"
vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
{{ with secret "kv/data/atlas/shared/keycloak-admin" }}
export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
{{ end }}
spec:
serviceAccountName: mas-secrets-ensure
restartPolicy: Never
affinity:
nodeAffinity:
# Must land on a worker node; prefer arm64 when one is schedulable.
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: apply
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/bin/sh", "-c"]
args:
- |
set -euo pipefail
# Load the Keycloak admin credentials rendered by the Vault agent (see annotations).
. /vault/secrets/keycloak-admin-env.sh
KC_URL="http://keycloak.sso.svc.cluster.local"
# --- 1. Obtain a Keycloak admin token: up to 5 attempts with linear backoff. ---
ACCESS_TOKEN=""
for attempt in 1 2 3 4 5; do
TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \
-H 'Content-Type: application/x-www-form-urlencoded' \
-d "grant_type=password" \
-d "client_id=admin-cli" \
-d "username=${KEYCLOAK_ADMIN}" \
-d "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)"
ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)"
if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then
break
fi
echo "Keycloak token request failed (attempt ${attempt})" >&2
sleep $((attempt * 2))
done
if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
echo "Failed to fetch Keycloak admin token" >&2
exit 1
fi
# --- 2. Ensure the "metis" client exists in the atlas realm (create when absent). ---
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients?clientId=metis" || true)"
CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
create_payload='{"clientId":"metis","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://recovery.bstein.dev/oauth2/callback"],"webOrigins":["https://recovery.bstein.dev"],"rootUrl":"https://recovery.bstein.dev","baseUrl":"/"}'
# 409 means another run created it concurrently; treat as success and re-query.
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${create_payload}" \
"$KC_URL/admin/realms/atlas/clients")"
if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
echo "Keycloak client create failed (status ${status})" >&2
exit 1
fi
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients?clientId=metis" || true)"
CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
fi
if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
echo "Keycloak client metis not found" >&2
exit 1
fi
# --- 3. Attach the "groups" client scope if neither default nor optional has it. ---
SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)"
if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then
echo "Keycloak client scope groups not found" >&2
exit 1
fi
DEFAULT_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/default-client-scopes" || true)"
OPTIONAL_SCOPES="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes" || true)"
if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1 \
&& ! echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2>&1; then
# Try PUT first, fall back to POST (Keycloak versions differ on the accepted verb).
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${SCOPE_ID}")"
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
echo "Failed to attach groups client scope to metis (status ${status})" >&2
exit 1
fi
fi
fi
# --- 4. Enforce the desired client configuration idempotently (PUT full payload). ---
update_payload='{"enabled":true,"clientId":"metis","protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://recovery.bstein.dev/oauth2/callback"],"webOrigins":["https://recovery.bstein.dev"],"rootUrl":"https://recovery.bstein.dev","baseUrl":"/"}'
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${update_payload}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")"
if [ "$status" != "204" ]; then
echo "Keycloak client update failed (status ${status})" >&2
exit 1
fi
# --- 5. Read the client secret generated by Keycloak for the confidential client. ---
CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then
echo "Keycloak client secret not found" >&2
exit 1
fi
# --- 6. Log in to Vault via the Kubernetes auth method using the pod SA token. ---
vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
vault_role="${VAULT_ROLE:-sso-secrets}"
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
vault_token="$(curl -sS --request POST --data "${login_payload}" \
"${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
echo "vault login failed" >&2
exit 1
fi
# --- 7. Reuse an existing cookie_secret when valid; 404 means first run. ---
read_status="$(curl -sS -o /tmp/metis-oidc-read.json -w "%{http_code}" \
-H "X-Vault-Token: ${vault_token}" \
"${vault_addr}/v1/kv/data/atlas/maintenance/metis-oidc" || true)"
COOKIE_SECRET=""
if [ "${read_status}" = "200" ]; then
COOKIE_SECRET="$(jq -r '.data.data.cookie_secret // empty' /tmp/metis-oidc-read.json)"
elif [ "${read_status}" != "404" ]; then
echo "Vault read failed (status ${read_status})" >&2
cat /tmp/metis-oidc-read.json >&2 || true
exit 1
fi
# Only 16/24/32-byte secrets are kept; anything else is regenerated.
# NOTE(review): presumably the oauth2-proxy cookie-secret (AES key size) rule — confirm consumer.
if [ -n "${COOKIE_SECRET}" ]; then
length="$(printf '%s' "${COOKIE_SECRET}" | wc -c | tr -d ' ')"
if [ "${length}" != "16" ] && [ "${length}" != "24" ] && [ "${length}" != "32" ]; then
COOKIE_SECRET=""
fi
fi
if [ -z "${COOKIE_SECRET}" ]; then
COOKIE_SECRET="$(openssl rand -hex 16 | tr -d '\n')"
fi
# --- 8. Write the KV v2 payload and verify it reads back successfully. ---
payload="$(jq -nc \
--arg client_id "metis" \
--arg client_secret "${CLIENT_SECRET}" \
--arg cookie_secret "${COOKIE_SECRET}" \
'{data:{client_id:$client_id,client_secret:$client_secret,cookie_secret:$cookie_secret}}')"
write_status="$(curl -sS -o /tmp/metis-oidc-write.json -w "%{http_code}" -X POST \
-H "X-Vault-Token: ${vault_token}" \
-H 'Content-Type: application/json' \
-d "${payload}" "${vault_addr}/v1/kv/data/atlas/maintenance/metis-oidc")"
if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
echo "Vault write failed (status ${write_status})" >&2
cat /tmp/metis-oidc-write.json >&2 || true
exit 1
fi
verify_status="$(curl -sS -o /tmp/metis-oidc-verify.json -w "%{http_code}" \
-H "X-Vault-Token: ${vault_token}" \
"${vault_addr}/v1/kv/data/atlas/maintenance/metis-oidc" || true)"
if [ "${verify_status}" != "200" ]; then
echo "Vault verify failed (status ${verify_status})" >&2
cat /tmp/metis-oidc-verify.json >&2 || true
exit 1
fi
echo "Metis OIDC secret ready in Vault"

View File

@ -1,136 +0,0 @@
# services/keycloak/oneoffs/metis-ssh-keys-secret-ensure-job.yaml
# One-off job for sso/metis-ssh-keys-secret-ensure-1.
# Purpose: ensure Vault path maintenance/metis-ssh-keys exists for Metis key injection.
# Migration behavior: if Vault path is missing/incomplete, seed from existing maintenance/metis-ssh-keys Kubernetes Secret.
# Legacy key names are read as fallback, but only ananke_* keys are written.
apiVersion: batch/v1
kind: Job
metadata:
name: metis-ssh-keys-secret-ensure-1
namespace: sso
spec:
# No retries; the completed Job is garbage-collected an hour after finishing.
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
metadata:
annotations:
# NOTE(review): the injected keycloak-admin-env.sh is never sourced by the
# script below — these agent annotations look vestigial; confirm before removing.
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/agent-pre-populate-only: "true"
vault.hashicorp.com/role: "sso-secrets"
vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
{{ with secret "kv/data/atlas/shared/keycloak-admin" }}
export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
{{ end }}
spec:
serviceAccountName: mas-secrets-ensure
restartPolicy: Never
affinity:
nodeAffinity:
# Must land on a worker node; prefer arm64 when one is schedulable.
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: apply
image: registry.bstein.dev/bstein/kubectl:1.35.0
command: ["/bin/sh", "-c"]
args:
- |
set -euo pipefail
# --- 1. Log in to Vault via the Kubernetes auth method using the pod SA token. ---
vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
vault_role="${VAULT_ROLE:-sso-secrets}"
vault_path="kv/data/atlas/maintenance/metis-ssh-keys"
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
vault_token="$(curl -sS --request POST --data "${login_payload}" \
"${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
echo "vault login failed" >&2
exit 1
fi
# --- 2. Short-circuit when Vault already holds the required keys. ---
# Legacy hecate_* field names are accepted as fallbacks when reading.
read_status="$(curl -sS -o /tmp/metis-ssh-read.json -w "%{http_code}" \
-H "X-Vault-Token: ${vault_token}" \
"${vault_addr}/v1/${vault_path}" || true)"
if [ "${read_status}" = "200" ]; then
bastion_existing="$(jq -r '.data.data.bastion_pub // empty' /tmp/metis-ssh-read.json)"
brad_existing="$(jq -r '.data.data.brad_pub // empty' /tmp/metis-ssh-read.json)"
ananke_tethys_existing="$(jq -r '.data.data.ananke_tethys_pub // .data.data.hecate_tethys_pub // empty' /tmp/metis-ssh-read.json)"
ananke_db_existing="$(jq -r '.data.data.ananke_db_pub // .data.data.hecate_db_pub // empty' /tmp/metis-ssh-read.json)"
if [ -n "${bastion_existing}" ] && [ -n "${brad_existing}" ] && [ -n "${ananke_tethys_existing}" ]; then
echo "Vault metis-ssh-keys already present"
exit 0
fi
elif [ "${read_status}" != "404" ]; then
echo "Vault read failed (status ${read_status})" >&2
cat /tmp/metis-ssh-read.json >&2 || true
exit 1
fi
# --- 3. Seed from the maintenance/metis-ssh-keys Kubernetes Secret, ---
# trying legacy hecate_* entries when the ananke_* names are absent.
bastion_pub="$(kubectl -n maintenance get secret metis-ssh-keys -o jsonpath='{.data.bastion_pub}' 2>/dev/null | base64 -d || true)"
brad_pub="$(kubectl -n maintenance get secret metis-ssh-keys -o jsonpath='{.data.brad_pub}' 2>/dev/null | base64 -d || true)"
ananke_tethys_pub="$(kubectl -n maintenance get secret metis-ssh-keys -o jsonpath='{.data.ananke_tethys_pub}' 2>/dev/null | base64 -d || true)"
if [ -z "${ananke_tethys_pub}" ]; then
ananke_tethys_pub="$(kubectl -n maintenance get secret metis-ssh-keys -o jsonpath='{.data.hecate_tethys_pub}' 2>/dev/null | base64 -d || true)"
fi
ananke_db_pub="$(kubectl -n maintenance get secret metis-ssh-keys -o jsonpath='{.data.ananke_db_pub}' 2>/dev/null | base64 -d || true)"
if [ -z "${ananke_db_pub}" ]; then
ananke_db_pub="$(kubectl -n maintenance get secret metis-ssh-keys -o jsonpath='{.data.hecate_db_pub}' 2>/dev/null | base64 -d || true)"
fi
# Prefer the k8s Secret values; fall back to whatever partial data Vault had.
# The *_existing vars are only set on a 200 read, hence the :- expansions.
if [ -z "${bastion_pub}" ] && [ -n "${bastion_existing:-}" ]; then
bastion_pub="${bastion_existing}"
fi
if [ -z "${brad_pub}" ] && [ -n "${brad_existing:-}" ]; then
brad_pub="${brad_existing}"
fi
if [ -z "${ananke_tethys_pub}" ] && [ -n "${ananke_tethys_existing:-}" ]; then
ananke_tethys_pub="${ananke_tethys_existing}"
fi
if [ -z "${ananke_db_pub}" ] && [ -n "${ananke_db_existing:-}" ]; then
ananke_db_pub="${ananke_db_existing}"
fi
# ananke_db_pub is optional; the other three are required.
if [ -z "${bastion_pub}" ] || [ -z "${brad_pub}" ] || [ -z "${ananke_tethys_pub}" ]; then
echo "Cannot seed Vault metis-ssh-keys: maintenance/metis-ssh-keys missing required keys" >&2
exit 1
fi
# --- 4. Write the KV v2 payload (only ananke_* names) and verify it reads back. ---
payload="$(jq -nc \
--arg bastion_pub "${bastion_pub}" \
--arg brad_pub "${brad_pub}" \
--arg ananke_tethys_pub "${ananke_tethys_pub}" \
--arg ananke_db_pub "${ananke_db_pub}" \
'{data:{bastion_pub:$bastion_pub,brad_pub:$brad_pub,ananke_tethys_pub:$ananke_tethys_pub,ananke_db_pub:$ananke_db_pub}}')"
write_status="$(curl -sS -o /tmp/metis-ssh-write.json -w "%{http_code}" -X POST \
-H "X-Vault-Token: ${vault_token}" \
-H 'Content-Type: application/json' \
-d "${payload}" \
"${vault_addr}/v1/${vault_path}")"
if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
echo "Vault write failed (status ${write_status})" >&2
cat /tmp/metis-ssh-write.json >&2 || true
exit 1
fi
verify_status="$(curl -sS -o /tmp/metis-ssh-verify.json -w "%{http_code}" \
-H "X-Vault-Token: ${vault_token}" \
"${vault_addr}/v1/${vault_path}" || true)"
if [ "${verify_status}" != "200" ]; then
echo "Vault verify failed (status ${verify_status})" >&2
cat /tmp/metis-ssh-verify.json >&2 || true
exit 1
fi
echo "Metis SSH key material now persisted in Vault at maintenance/metis-ssh-keys"

Some files were not shown because too many files have changed in this diff Show More