Compare commits


No commits in common. "main" and "codex/ananke-gate-platform-metrics" have entirely different histories.

64 changed files with 1157 additions and 4539 deletions

.gitignore vendored

@@ -1,6 +1,4 @@
 /bin/
-/build/
 /dist/
-internal/state/.corrupt-*
 *.log
 *.tmp

Jenkinsfile vendored

@@ -1,59 +1,25 @@
 pipeline {
   agent {
     kubernetes {
-      label 'ananke-quality'
       defaultContainer 'go-tester'
       yaml """
 apiVersion: v1
 kind: Pod
 spec:
   nodeSelector:
-    hardware: rpi5
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: "true"
-  affinity:
-    nodeAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-        nodeSelectorTerms:
-        - matchExpressions:
-          - key: kubernetes.io/hostname
-            operator: NotIn
-            values:
-            - titan-06
-      preferredDuringSchedulingIgnoredDuringExecution:
-      - weight: 100
-        preference:
-          matchExpressions:
-          - key: kubernetes.io/hostname
-            operator: NotIn
-            values:
-            - titan-13
-            - titan-15
-            - titan-17
-            - titan-19
-  topologySpreadConstraints:
-  - maxSkew: 1
-    topologyKey: kubernetes.io/hostname
-    whenUnsatisfiable: ScheduleAnyway
-    labelSelector:
-      matchLabels:
-        jenkins/jenkins-jenkins-agent: "true"
   containers:
   - name: go-tester
-    image: registry.bstein.dev/bstein/golang:1.25-bookworm
+    image: golang:1.25-bookworm
     command: ["cat"]
     tty: true
     volumeMounts:
     - name: workspace-volume
       mountPath: /home/jenkins/agent
   - name: publisher
-    image: registry.bstein.dev/bstein/python:3.12-slim
-    command: ["cat"]
-    tty: true
-    volumeMounts:
-    - name: workspace-volume
-      mountPath: /home/jenkins/agent
-  - name: quality-tools
-    image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
+    image: python:3.12-slim
     command: ["cat"]
     tty: true
     volumeMounts:
@@ -69,13 +35,7 @@ spec:
   environment {
     SUITE_NAME = 'ananke'
     PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
-    SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
-    SONARQUBE_PROJECT_KEY = 'ananke'
-    SONARQUBE_TOKEN = credentials('sonarqube-token')
-    QUALITY_GATE_SONARQUBE_ENFORCE = '1'
     QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
-    QUALITY_GATE_IRONBANK_ENFORCE = '1'
-    QUALITY_GATE_IRONBANK_REQUIRED = '0'
     QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
   }
@@ -97,27 +57,6 @@ spec:
     stage('Collect SonarQube evidence') {
       steps {
-        container('quality-tools') {
-          sh '''#!/usr/bin/env bash
-set -euo pipefail
-mkdir -p build
-args=(
-  "-Dsonar.host.url=${SONARQUBE_HOST_URL}"
-  "-Dsonar.login=${SONARQUBE_TOKEN}"
-  "-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
-  "-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
-  "-Dsonar.sources=."
-  "-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**"
-  "-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
-)
-[ -f build/coverage.out ] && args+=("-Dsonar.go.coverage.reportPaths=build/coverage.out")
-set +e
-sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
-rc=${PIPESTATUS[0]}
-set -e
-printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
-'''
-        }
         container('publisher') {
           sh '''
 set -eu
@@ -156,34 +95,6 @@ PY
     stage('Collect Supply Chain evidence') {
       steps {
-        container('quality-tools') {
-          sh '''#!/usr/bin/env bash
-set -euo pipefail
-mkdir -p build
-set +e
-trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
-trivy_rc=$?
-set -e
-if [ ! -s build/trivy-fs.json ]; then
-  cat > build/ironbank-compliance.json <<EOF
-{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
-EOF
-  exit 0
-fi
-critical="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="CRITICAL")] | length' build/trivy-fs.json)"
-high="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="HIGH")] | length' build/trivy-fs.json)"
-secrets="$(jq '[.Results[]? | .Secrets[]?] | length' build/trivy-fs.json)"
-misconfigs="$(jq '[.Results[]? | .Misconfigurations[]? | select(.Status=="FAIL" and (.Severity=="CRITICAL" or .Severity=="HIGH"))] | length' build/trivy-fs.json)"
-status=ok
-compliant=true
-if [ "${critical}" -gt 0 ] || [ "${secrets}" -gt 0 ] || [ "${misconfigs}" -gt 0 ]; then
-  status=failed
-  compliant=false
-fi
-jq -n --arg status "${status}" --argjson compliant "${compliant}" --argjson critical "${critical}" --argjson high "${high}" --argjson secrets "${secrets}" --argjson misconfigs "${misconfigs}" --argjson trivy_rc "${trivy_rc}" \
-  '{status:$status, compliant:$compliant, category:"artifact_security", scan_type:"filesystem", scanner:"trivy", critical_vulnerabilities:$critical, high_vulnerabilities:$high, secrets:$secrets, high_or_critical_misconfigurations:$misconfigs, trivy_rc:$trivy_rc, high_vulnerability_policy:"observe"}' > build/ironbank-compliance.json
-'''
-        }
         container('publisher') {
           sh '''
 set -eu
@@ -241,25 +152,13 @@ PY
 failed_runs="$(awk -F= '$1=="failed"{print $2}' build/quality-gate.state 2>/dev/null | tail -n1)"
 [ -n "${ok_runs}" ] || ok_runs=0
 [ -n "${failed_runs}" ] || failed_runs=0
-coverage_percent="$(python3 - <<'PY'
-import re
-from pathlib import Path
-log_path = Path("build/quality-gate.out")
-text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else ""
-values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)]
-print(values[-1] if values else 0.0)
-PY
-)"
-printf '%s\n' "${coverage_percent}" > build/coverage-percent.txt
 python3 scripts/publish_quality_metrics.py \
   --pushgateway-url "${PUSHGATEWAY_URL}" \
   --job-name platform-quality-ci \
   --suite "${SUITE_NAME}" \
   --trigger jenkins \
   --local-ok "${ok_runs}" \
-  --local-failed "${failed_runs}" \
-  --coverage-percent-file build/coverage-percent.txt
+  --local-failed "${failed_runs}"
 '''
         }
       }
@@ -270,95 +169,7 @@ PY
         container('publisher') {
           sh '''
 set -eu
-gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
-fail=0
-if [ "${gate_rc}" -ne 0 ]; then
-  echo "quality gate failed with rc=${gate_rc}" >&2
-  fail=1
-fi
-enabled() {
-  case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
-    1|true|yes|on) return 0 ;;
-    *) return 1 ;;
-  esac
-}
-if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
-  sonar_status="$(python3 - <<'PY'
-import json
-from pathlib import Path
-path = Path("build/sonarqube-quality-gate.json")
-if not path.exists():
-    print("missing")
-    raise SystemExit(0)
-try:
-    payload = json.loads(path.read_text(encoding="utf-8"))
-except Exception: # noqa: BLE001
-    print("error")
-    raise SystemExit(0)
-status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
-print(status or "missing")
-PY
-)"
-  case "${sonar_status}" in
-    ok|pass|passed|success) ;;
-    *)
-      echo "sonarqube gate failed: ${sonar_status}" >&2
-      fail=1
-      ;;
-  esac
-fi
-ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
-if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
-  ironbank_required=1
-fi
-if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
-  supply_status="$(python3 - <<'PY'
-import json
-from pathlib import Path
-path = Path("build/ironbank-compliance.json")
-if not path.exists():
-    print("missing")
-    raise SystemExit(0)
-try:
-    payload = json.loads(path.read_text(encoding="utf-8"))
-except Exception: # noqa: BLE001
-    print("error")
-    raise SystemExit(0)
-compliant = payload.get("compliant")
-if compliant is True:
-    print("ok")
-elif compliant is False:
-    print("failed")
-else:
-    status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
-    print(status or "missing")
-PY
-)"
-  case "${supply_status}" in
-    ok|pass|passed|success|compliant) ;;
-    not_applicable|na|n/a)
-      if enabled "${ironbank_required}"; then
-        echo "supply chain gate required but status=${supply_status}" >&2
-        fail=1
-      fi
-      ;;
-    *)
-      if enabled "${ironbank_required}"; then
-        echo "supply chain gate failed: ${supply_status}" >&2
-        fail=1
-      else
-        echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
-      fi
-      ;;
-  esac
-fi
-exit "${fail}"
+test "$(cat build/quality-gate.rc 2>/dev/null || echo 1)" -eq 0
 '''
         }
       }
@@ -367,7 +178,7 @@ PY
   post {
     always {
-      archiveArtifacts artifacts: 'build/*.json,build/*.out,build/*.rc,build/*.txt,build/*.xml', allowEmptyArchive: true, fingerprint: true
+      archiveArtifacts artifacts: 'build/quality-gate.out,build/quality-gate.rc', allowEmptyArchive: true, fingerprint: true
     }
   }
 }
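The enforcement step removed above normalizes several possible report shapes into one status string before gating. Below is a minimal Go sketch of that normalization, assuming only the payload keys the deleted heredoc probed (status, projectStatus.status, qualityGate.status) and the accepted values from its case arm; it is an illustration, not code from this repository.

// gate_status_sketch: hypothetical, mirrors the removed publisher heredoc.
package main

import (
	"encoding/json"
	"fmt"
	"os"
	"strings"
)

type sonarGateReport struct {
	Status        string `json:"status"`
	ProjectStatus struct {
		Status string `json:"status"`
	} `json:"projectStatus"`
	QualityGate struct {
		Status string `json:"status"`
	} `json:"qualityGate"`
}

// sonarGateStatus returns "missing" if the report is absent, "error" if it
// cannot be parsed, otherwise the first non-empty status, lower-cased.
func sonarGateStatus(path string) string {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "missing"
	}
	var r sonarGateReport
	if err := json.Unmarshal(raw, &r); err != nil {
		return "error"
	}
	for _, s := range []string{r.Status, r.ProjectStatus.Status, r.QualityGate.Status} {
		if s = strings.ToLower(strings.TrimSpace(s)); s != "" {
			return s
		}
	}
	return "missing"
}

func main() {
	switch status := sonarGateStatus("build/sonarqube-quality-gate.json"); status {
	case "ok", "pass", "passed", "success": // accepted values from the removed case arm
		fmt.Println("sonarqube gate passed")
	default:
		fmt.Printf("sonarqube gate failed: %s\n", status)
		os.Exit(1)
	}
}

Treating "missing" and "error" as failures (rather than silently passing) is the same fail-closed choice the deleted shell logic made.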


@@ -97,15 +97,10 @@ Primary config path:
 Keep these fields accurate:
 - `expected_flux_source_url`
 - `expected_flux_branch`
-- `startup.service_checklist_explicit_only`
 - `startup.service_checklist`
 - `startup.critical_service_endpoints`
 - `startup.require_ingress_checklist`
 - `startup.require_node_inventory_reachability`
-- `startup.node_inventory_reachability_required_nodes`
-- `startup.node_ssh_auth_required_nodes`
-- `startup.flux_health_required_kustomizations`
-- `startup.workload_convergence_required_namespaces`
 - `startup.ignore_unavailable_nodes`
 - `coordination.role`
 - `coordination.peer_hosts`
@@ -139,10 +134,9 @@ Installer behavior:
 When adding nodes or services:
 1. Update inventory and node mapping in config.
-2. Keep the explicit service checklist focused on the core services that must come back during an outage.
-3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap.
-4. Add/adjust ingress expectations for exposed services.
-5. Use temporary ignores only when truly intentional, then remove them.
-6. Run `scripts/quality_gate.sh` before host deployment.
+2. Add/adjust service checklist entries for anything user-facing or critical.
+3. Add/adjust ingress expectations for exposed services.
+4. Use temporary ignores only when truly intentional, then remove them.
+5. Run `scripts/quality_gate.sh` before host deployment.
 Recovery quality should improve over time: every drill should reduce manual work in the next drill.
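The removed guidance above leans on the `*_required_*` startup scopes. One consumer is visible later in this diff (the startupRequiredNodes call in waitForNodeSSHAuth), but the helper's body is not. A plausible minimal reconstruction of the scoping semantics the docs describe is sketched here, offered only as an assumption: an empty scope means check everything, a non-empty scope narrows the check so optional stacks cannot block bootstrap.

// Hypothetical reconstruction; the real startupRequiredNodes body is not in this diff.
package cluster

import "strings"

func startupRequiredNodes(nodes, required []string) []string {
	if len(required) == 0 {
		return nodes // no explicit scope: keep the full inventory
	}
	want := map[string]struct{}{}
	for _, n := range required {
		if n = strings.TrimSpace(n); n != "" {
			want[n] = struct{}{}
		}
	}
	// Intersect the live inventory with the required set.
	out := make([]string, 0, len(nodes))
	for _, n := range nodes {
		if _, ok := want[strings.TrimSpace(n)]; ok {
			out = append(out, n)
		}
	}
	return out
}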


@@ -51,7 +51,6 @@ startup:
   require_node_inventory_reachability: true
   node_inventory_reachability_wait_seconds: 300
   node_inventory_reachability_poll_seconds: 5
-  node_inventory_reachability_required_nodes: []
   required_node_labels:
     titan-09:
       ananke.bstein.dev/harbor-bootstrap: "true"
@@ -91,7 +90,6 @@ startup:
     admin_secret_name: keycloak-admin
     admin_secret_username_key: username
     admin_secret_password_key: password
-  service_checklist_explicit_only: false
   service_checklist:
     - name: gitea-api
       url: https://scm.bstein.dev/api/healthz
@@ -136,26 +134,18 @@ startup:
   require_node_ssh_auth: true
   node_ssh_auth_wait_seconds: 240
   node_ssh_auth_poll_seconds: 5
-  node_ssh_auth_required_nodes: []
   require_flux_health: true
   flux_health_wait_seconds: 900
   flux_health_poll_seconds: 5
-  flux_health_required_kustomizations: []
   ignore_flux_kustomizations: []
   require_workload_convergence: true
   workload_convergence_wait_seconds: 900
   workload_convergence_poll_seconds: 5
-  workload_convergence_required_namespaces: []
   ignore_workload_namespaces: []
   ignore_workloads: []
   ignore_unavailable_nodes: []
   auto_recycle_stuck_pods: true
-  auto_quarantine_scheduling_storms: false
-  scheduling_storm_event_threshold: 30
-  scheduling_storm_window_seconds: 180
   stuck_pod_grace_seconds: 180
-  post_start_auto_heal_seconds: 60
-  dead_node_cleanup_grace_seconds: 300
   vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
   vault_unseal_breakglass_command: ""
   vault_unseal_breakglass_timeout_seconds: 15
@@ -180,7 +170,6 @@ ups:
   target: pyrphoros@localhost
   poll_seconds: 5
   runtime_safety_factor: 1.25
-  on_battery_grace_seconds: 90
   debounce_count: 3
   telemetry_timeout_seconds: 90
 coordination:


@@ -117,52 +117,8 @@ startup:
   require_node_inventory_reachability: true
   node_inventory_reachability_wait_seconds: 300
   node_inventory_reachability_poll_seconds: 5
-  node_inventory_reachability_required_nodes:
-    - titan-0a
-    - titan-0b
-    - titan-0c
   required_node_labels:
-    titan-04:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-05:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-06:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-07:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-08:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-11:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-12:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-13:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-14:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-15:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-17:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-18:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-19:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
     titan-09:
-      node-role.kubernetes.io/worker: "true"
       ananke.bstein.dev/harbor-bootstrap: "true"
   require_time_sync: true
   time_sync_wait_seconds: 240
@@ -200,7 +156,6 @@ startup:
     admin_secret_name: keycloak-admin
     admin_secret_username_key: username
     admin_secret_password_key: password
-  service_checklist_explicit_only: true
   service_checklist:
     - name: gitea-api
       url: https://scm.bstein.dev/api/healthz
@@ -245,49 +200,18 @@ startup:
   require_node_ssh_auth: true
   node_ssh_auth_wait_seconds: 240
   node_ssh_auth_poll_seconds: 5
-  node_ssh_auth_required_nodes:
-    - titan-0a
-    - titan-0b
-    - titan-0c
   require_flux_health: true
   flux_health_wait_seconds: 900
   flux_health_poll_seconds: 5
-  flux_health_required_kustomizations:
-    - flux-system/core
-    - flux-system/helm
-    - flux-system/traefik
-    - flux-system/cert-manager
-    - flux-system/longhorn
-    - flux-system/vault-csi
-    - flux-system/vault-injector
-    - flux-system/postgres
-    - flux-system/vault
-    - flux-system/keycloak
-    - flux-system/oauth2-proxy
-    - flux-system/gitea
-    - flux-system/monitoring
-    - flux-system/harbor
   ignore_flux_kustomizations: []
   require_workload_convergence: true
   workload_convergence_wait_seconds: 900
   workload_convergence_poll_seconds: 5
-  workload_convergence_required_namespaces:
-    - vault
-    - postgres
-    - sso
-    - gitea
-    - monitoring
-    - harbor
   ignore_workload_namespaces: []
   ignore_workloads: []
   ignore_unavailable_nodes: []
   auto_recycle_stuck_pods: true
-  auto_quarantine_scheduling_storms: true
-  scheduling_storm_event_threshold: 30
-  scheduling_storm_window_seconds: 180
   stuck_pod_grace_seconds: 180
-  post_start_auto_heal_seconds: 60
-  dead_node_cleanup_grace_seconds: 300
   vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
   vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
   vault_unseal_breakglass_timeout_seconds: 15
@@ -311,7 +235,6 @@ ups:
   target: statera@localhost
   poll_seconds: 5
   runtime_safety_factor: 1.25
-  on_battery_grace_seconds: 90
   debounce_count: 3
   telemetry_timeout_seconds: 90
 coordination:


@@ -117,52 +117,8 @@ startup:
   require_node_inventory_reachability: true
   node_inventory_reachability_wait_seconds: 300
   node_inventory_reachability_poll_seconds: 5
-  node_inventory_reachability_required_nodes:
-    - titan-0a
-    - titan-0b
-    - titan-0c
   required_node_labels:
-    titan-04:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-05:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-06:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-07:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-08:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-11:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-12:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-13:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-14:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-15:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-17:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-18:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-19:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
     titan-09:
-      node-role.kubernetes.io/worker: "true"
       ananke.bstein.dev/harbor-bootstrap: "true"
   require_time_sync: true
   time_sync_wait_seconds: 240
@@ -200,7 +156,6 @@ startup:
     admin_secret_name: keycloak-admin
     admin_secret_username_key: username
     admin_secret_password_key: password
-  service_checklist_explicit_only: true
   service_checklist:
     - name: gitea-api
       url: https://scm.bstein.dev/api/healthz
@@ -245,49 +200,18 @@ startup:
   require_node_ssh_auth: true
   node_ssh_auth_wait_seconds: 240
   node_ssh_auth_poll_seconds: 5
-  node_ssh_auth_required_nodes:
-    - titan-0a
-    - titan-0b
-    - titan-0c
   require_flux_health: true
   flux_health_wait_seconds: 900
   flux_health_poll_seconds: 5
-  flux_health_required_kustomizations:
-    - flux-system/core
-    - flux-system/helm
-    - flux-system/traefik
-    - flux-system/cert-manager
-    - flux-system/longhorn
-    - flux-system/vault-csi
-    - flux-system/vault-injector
-    - flux-system/postgres
-    - flux-system/vault
-    - flux-system/keycloak
-    - flux-system/oauth2-proxy
-    - flux-system/gitea
-    - flux-system/monitoring
-    - flux-system/harbor
   ignore_flux_kustomizations: []
   require_workload_convergence: true
   workload_convergence_wait_seconds: 900
   workload_convergence_poll_seconds: 5
-  workload_convergence_required_namespaces:
-    - vault
-    - postgres
-    - sso
-    - gitea
-    - monitoring
-    - harbor
   ignore_workload_namespaces: []
   ignore_workloads: []
   ignore_unavailable_nodes: []
   auto_recycle_stuck_pods: true
-  auto_quarantine_scheduling_storms: true
-  scheduling_storm_event_threshold: 30
-  scheduling_storm_window_seconds: 180
   stuck_pod_grace_seconds: 180
-  post_start_auto_heal_seconds: 60
-  dead_node_cleanup_grace_seconds: 300
   vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
   vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
   vault_unseal_breakglass_timeout_seconds: 15
@@ -311,7 +235,6 @@ ups:
   target: pyrphoros@localhost
   poll_seconds: 5
   runtime_safety_factor: 1.25
-  on_battery_grace_seconds: 90
   debounce_count: 3
   telemetry_timeout_seconds: 90
 coordination:


@@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e
 	ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
 	seen := map[string]struct{}{}
 	targets := make([]string, 0, len(nodes))
-	for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) {
+	for _, node := range nodes {
 		node = strings.TrimSpace(node)
 		if node == "" {
 			continue


@@ -1,288 +0,0 @@
package cluster
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
)
type nodeReadyList struct {
Items []struct {
Metadata struct {
Name string `json:"name"`
} `json:"metadata"`
Status struct {
Conditions []struct {
Type string `json:"type"`
Status string `json:"status"`
} `json:"conditions"`
} `json:"status"`
} `json:"items"`
}
type podDeleteList struct {
Items []struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
DeletionTimestamp *time.Time `json:"deletionTimestamp"`
} `json:"metadata"`
Spec struct {
NodeName string `json:"nodeName"`
} `json:"spec"`
} `json:"items"`
}
// RunPostStartAutoHeal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
// post-start drift without rerunning the full startup flow.
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
return o.postStartAutoHeal(ctx)
}
// postStartAutoHeal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
// Why: centralizes bounded post-start repair actions so recurring outage
// patterns only trigger the specific remediation they need.
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
errs := []string{}
requestReconcile := false
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
errs = append(errs, fmt.Sprintf("required node labels: %v", err))
}
vaultRecovered, err := o.autoRecoverSealedVault(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
} else if vaultRecovered {
requestReconcile = true
if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
}
}
cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
} else if cleaned > 0 {
requestReconcile = true
}
if requestReconcile {
o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
return o.requestFluxReconcile(ctx)
})
}
if len(errs) > 0 {
return errors.New(strings.Join(errs, "; "))
}
return nil
}
// autoRecoverSealedVault runs one orchestration or CLI step.
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
// Why: lets the daemon repair a later Vault reseal without waiting for a new
// bootstrap run.
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
if o.runner.DryRun {
return false, nil
}
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
if err != nil {
if isNotFoundErr(err) {
return false, nil
}
return false, fmt.Errorf("vault pod phase check failed: %w", err)
}
if strings.TrimSpace(phase) != "Running" {
return false, nil
}
sealed, err := o.vaultSealed(ctx)
if err != nil {
return false, err
}
if !sealed {
return false, nil
}
o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
if err := o.ensureVaultUnsealed(ctx); err != nil {
return false, err
}
return true, nil
}
// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
// downstream secret consumers stop carrying stale failures from the sealed window.
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "vault",
"create", "job",
"--from=cronjob/vault-k8s-auth-config",
jobName,
); err != nil {
return fmt.Errorf("create job %s: %w", jobName, err)
}
o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
return nil
}
// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
// clear only that narrow failure class instead of leaving garbage behind forever.
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
if o.runner.DryRun {
return 0, nil
}
unavailable, err := o.unavailableNodeSet(ctx)
if err != nil {
return 0, err
}
if len(unavailable) == 0 {
return 0, nil
}
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
return 0, fmt.Errorf("query pods: %w", err)
}
var pods podDeleteList
if err := json.Unmarshal([]byte(out), &pods); err != nil {
return 0, fmt.Errorf("decode pods: %w", err)
}
grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
now := time.Now()
count := 0
for _, item := range pods.Items {
if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
continue
}
if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
continue
}
if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
continue
}
o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
if _, err := o.kubectl(
ctx,
20*time.Second,
"-n", item.Metadata.Namespace,
"delete", "pod", item.Metadata.Name,
"--grace-period=0",
"--force",
"--wait=false",
); err != nil && !isNotFoundErr(err) {
return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
}
count++
}
if count > 0 {
o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
}
return count, nil
}
// unavailableNodeSet runs one orchestration or CLI step.
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
if err != nil {
return nil, fmt.Errorf("query nodes: %w", err)
}
var nodes nodeReadyList
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
return nil, fmt.Errorf("decode nodes: %w", err)
}
unavailable := map[string]struct{}{}
for _, item := range nodes.Items {
ready := ""
for _, cond := range item.Status.Conditions {
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
ready = strings.TrimSpace(cond.Status)
break
}
}
if ready != "True" {
unavailable[item.Metadata.Name] = struct{}{}
}
}
return unavailable, nil
}
// requestFluxReconcile runs one orchestration or CLI step.
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
// Why: post-start repairs need a lightweight way to refresh GitOps health
// without reusing the broader startup flux-resume flow.
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
now := time.Now().UTC().Format(time.RFC3339)
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate", "gitrepository", "flux-system",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux source reconcile: %w", err)
}
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate",
"kustomizations.kustomize.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
}
if _, err := o.kubectl(
ctx,
25*time.Second,
"annotate",
"--all-namespaces",
"helmreleases.helm.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
}
if o.runOverride == nil && o.runner.CommandExists("flux") {
if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
}
}
return nil
}
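The deleted file above exports RunPostStartAutoHeal as the daemon's narrow repair entrypoint, and the deleted configs carried a post_start_auto_heal_seconds interval. The daemon loop itself is not in this diff; the following is a hypothetical wiring sketch (assumed, not the repo's code) of how such an interval could drive the entrypoint.

// Hypothetical daemon wiring; only the RunPostStartAutoHeal surface comes from the diff.
package daemon

import (
	"context"
	"log"
	"time"
)

// autoHealer matches the narrow surface the deleted file exported.
type autoHealer interface {
	RunPostStartAutoHeal(ctx context.Context) error
}

// runAutoHealLoop invokes the repair entrypoint on a fixed interval until the
// context is cancelled; errors are logged rather than fatal, mirroring the
// best-effort character of the repairs themselves.
func runAutoHealLoop(ctx context.Context, o autoHealer, every time.Duration) {
	t := time.NewTicker(every)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			if err := o.RunPostStartAutoHeal(ctx); err != nil {
				log.Printf("post-start auto-heal: %v", err)
			}
		}
	}
}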


@@ -1,296 +0,0 @@
package cluster
import (
"context"
"errors"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestCleanupTerminatingPodsOnUnavailableNodesBranches runs one orchestration or CLI step.
// Signature: TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T).
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
// truly stranded pods and tolerates already-gone objects.
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if err != nil || count != 0 {
t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
}
})
t.Run("selective cleanup tolerates not found", func(t *testing.T) {
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{"items":[` +
`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
},
{
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
err: errors.New("pod old-stale not found"),
},
})
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if err != nil {
t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
}
if count != 1 {
t.Fatalf("expected one cleaned pod, got %d", count)
}
})
t.Run("query and decode errors surface", func(t *testing.T) {
queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
err: errors.New("nodes failed"),
},
})
if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
t.Fatalf("expected node query error, got %v", err)
}
decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{bad json`,
},
})
if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
t.Fatalf("expected pod decode error, got %v", err)
}
})
t.Run("delete hard error surfaces", func(t *testing.T) {
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
},
{
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
err: errors.New("delete failed"),
},
})
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
}
})
}
// TestUnavailableNodeSetBranches runs one orchestration or CLI step.
// Signature: TestUnavailableNodeSetBranches(t *testing.T).
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
// Ready condition payloads need direct coverage too.
func TestUnavailableNodeSetBranches(t *testing.T) {
t.Run("decode error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
})
if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
t.Fatalf("expected decode error, got %v", err)
}
})
t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
},
})
nodes, err := orch.unavailableNodeSet(context.Background())
if err != nil {
t.Fatalf("unavailableNodeSet failed: %v", err)
}
if _, ok := nodes["titan-22"]; !ok {
t.Fatalf("expected titan-22 to be treated as unavailable")
}
if _, ok := nodes["titan-07"]; ok {
t.Fatalf("did not expect titan-07 to be treated as unavailable")
}
})
}
// TestRequestFluxReconcileBranches runs one orchestration or CLI step.
// Signature: TestRequestFluxReconcileBranches(t *testing.T).
// Why: the post-start repair loop needs predictable Flux refresh behavior even
// when one annotation call is flaky.
func TestRequestFluxReconcileBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
}
})
t.Run("git source annotate error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
err: errors.New("annotate failed"),
},
})
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
t.Fatalf("expected gitrepository annotate error, got %v", err)
}
})
t.Run("kustomization annotate error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
out: "",
},
{
match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
err: errors.New("annotate failed"),
},
})
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
t.Fatalf("expected kustomization annotate error, got %v", err)
}
})
t.Run("helm annotate warning and flux command path", func(t *testing.T) {
tmpDir := t.TempDir()
callLog := filepath.Join(tmpDir, "calls.log")
kubectlPath := filepath.Join(tmpDir, "kubectl")
fluxPath := filepath.Join(tmpDir, "flux")
kubectlScript := "#!/bin/sh\n" +
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"case \"$*\" in\n" +
" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
"esac\n" +
"exit 0\n"
fluxScript := "#!/bin/sh\n" +
"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 0\n"
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
t.Fatalf("write fake kubectl: %v", err)
}
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
t.Fatalf("write fake flux: %v", err)
}
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
cfg := config.Config{
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(cfg.State.RunHistoryPath),
log: log.New(io.Discard, "", 0),
}
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
}
calls, err := os.ReadFile(callLog)
if err != nil {
t.Fatalf("read fake command log: %v", err)
}
logText := string(calls)
if !strings.Contains(logText, "annotate gitrepository flux-system") {
t.Fatalf("expected gitrepository annotate call, got %q", logText)
}
if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
t.Fatalf("expected kustomization annotate call, got %q", logText)
}
if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
t.Fatalf("expected flux reconcile command, got %q", logText)
}
})
t.Run("flux command failure is tolerated", func(t *testing.T) {
tmpDir := t.TempDir()
callLog := filepath.Join(tmpDir, "calls.log")
kubectlPath := filepath.Join(tmpDir, "kubectl")
fluxPath := filepath.Join(tmpDir, "flux")
kubectlScript := "#!/bin/sh\n" +
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 0\n"
fluxScript := "#!/bin/sh\n" +
"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 1\n"
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
t.Fatalf("write fake kubectl: %v", err)
}
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
t.Fatalf("write fake flux: %v", err)
}
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
cfg := config.Config{
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(cfg.State.RunHistoryPath),
log: log.New(io.Discard, "", 0),
}
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
}
calls, err := os.ReadFile(callLog)
if err != nil {
t.Fatalf("read fake command log: %v", err)
}
if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
}
})
}


@@ -1,382 +0,0 @@
package cluster
import (
"context"
"encoding/base64"
"errors"
"io"
"log"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestPostStartAutoHealRepairsVaultAndUnavailableNodes runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
// Why: covers the new daemon-triggered repair path for late Vault reseals and
// stale terminating pods anchored to unavailable nodes.
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
RequiredNodeLabels: map[string]map[string]string{
"titan-07": {"node-role.kubernetes.io/worker": "true"},
},
},
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
log: log.New(io.Discard, "", 0),
}
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
unsealCalls := 0
jobCreated := false
reconciled := false
deleted := map[string]bool{}
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name != "kubectl" {
return "", nil
}
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
return "", nil
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
if unsealCalls == 0 {
return `{"initialized":true,"sealed":true}`, nil
}
return `{"initialized":true,"sealed":false}`, nil
case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
case strings.Contains(joined, "vault operator unseal"):
unsealCalls++
return "", nil
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
jobCreated = true
return "", nil
case strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case strings.Contains(joined, "get pods -A -o json"):
return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
deleted["maintenance/stale-pod"] = true
return "", nil
case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
reconciled = true
return "", nil
case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
return "", nil
case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
if err := orch.postStartAutoHeal(context.Background()); err != nil {
t.Fatalf("postStartAutoHeal failed: %v", err)
}
if unsealCalls != 1 {
t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
}
if !jobCreated {
t.Fatalf("expected vault k8s auth config job to be created")
}
if !deleted["maintenance/stale-pod"] {
t.Fatalf("expected stale unavailable-node pod to be deleted")
}
if !reconciled {
t.Fatalf("expected flux reconcile request after repairs")
}
if deleted["logging/healthy-node-pod"] {
t.Fatalf("did not expect terminating pod on healthy node to be deleted")
}
}
// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
// Why: proves the new post-start repair loop stays quiet when the specific
// failure patterns are absent.
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
},
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
log: log.New(io.Discard, "", 0),
}
unsealCalls := 0
jobCreated := false
reconciled := false
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name != "kubectl" {
return "", nil
}
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
return `{"initialized":true,"sealed":false}`, nil
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
jobCreated = true
return "", nil
case strings.Contains(joined, "vault operator unseal"):
unsealCalls++
return "", nil
case strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case strings.Contains(joined, "get pods -A -o json"):
return `{"items":[]}`, nil
case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
reconciled = true
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
if err := orch.postStartAutoHeal(context.Background()); err != nil {
t.Fatalf("postStartAutoHeal failed: %v", err)
}
if unsealCalls != 0 {
t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
}
if jobCreated {
t.Fatalf("did not expect vault auth config job creation")
}
if reconciled {
t.Fatalf("did not expect flux reconcile request for healthy cluster")
}
}
// TestRunPostStartAutoHealDryRun runs one orchestration or CLI step.
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
// auto-heal never mutates cluster state during rehearsal runs.
func TestRunPostStartAutoHealDryRun(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
}
}
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
// Why: proves the daemon reports each failed sub-repair together instead of
// hiding later failures behind the first problem.
func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
RequiredNodeLabels: map[string]map[string]string{
"titan-07": {"node-role.kubernetes.io/worker": "true"},
},
},
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{
match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
err: errors.New("label failed"),
},
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault phase failed"),
},
{
match: matchContains("kubectl", "get nodes -o json"),
err: errors.New("node query failed"),
},
})
err := orch.postStartAutoHeal(context.Background())
if err == nil {
t.Fatalf("expected aggregated error")
}
msg := err.Error()
for _, want := range []string{
"required node labels:",
"vault auto-recovery:",
"dead-node terminating pod cleanup:",
} {
if !strings.Contains(msg, want) {
t.Fatalf("expected %q in %q", want, msg)
}
}
}
// TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step.
// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
// Why: late Vault reseals are a high-risk failure path, so the daemon needs
// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("pod missing is quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault-0 not found"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("phase check error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("phase check failed"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("non-running pod defers", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Pending",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("status parse failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: "garbage",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("already unsealed stays quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":false}`,
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("unseal failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":true}`,
},
{
match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
out: base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
},
{
match: matchContains("kubectl", "vault operator unseal"),
err: errors.New("exec boom"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
}
})
}
// TestRerunVaultK8sAuthConfigJobBranches runs one orchestration or CLI step.
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
// Why: the post-unseal auth job is part of the production recovery chain, so
// dry-run and create-error behavior both need explicit coverage.
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
}
})
t.Run("create error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
err: errors.New("create failed"),
},
})
err := orch.rerunVaultK8sAuthConfigJob(context.Background())
if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
t.Fatalf("expected create-job error, got %v", err)
}
})
}
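
For readers outside this repo: these tests lean on a small command-stub harness. The real commandStub, matchContains, and buildOrchestratorWithStubs live elsewhere in the package; the sketch below is a hypothetical reconstruction inferred only from how the tests above use them, not the package's actual definitions.

// Hypothetical reconstruction (requires "strings").
// A commandStub intercepts one external command and fakes its result.
type commandStub struct {
	match func(name string, args []string) bool // selects the invocation to intercept
	out   string                                // stdout returned to the caller
	err   error                                 // error returned instead of running anything
}

// matchContains (assumed shape) matches a binary by name plus a substring of its
// space-joined arguments, which is how the tests above target specific kubectl calls.
func matchContains(name, substr string) func(string, []string) bool {
	return func(gotName string, args []string) bool {
		return gotName == name && strings.Contains(strings.Join(args, " "), substr)
	}
}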

View File

@ -227,31 +227,6 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name) return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
} }
// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error).
// Why: lets startup defer vault unseal until the pod is actually runnable, while
// keeping the direct unseal helper strict for explicit recovery paths and tests.
func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) {
if o.runner.DryRun {
return false, "", nil
}
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
if err != nil {
if isNotFoundErr(err) {
return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil
}
return false, "", fmt.Errorf("vault pod phase check failed: %w", err)
}
trimmedPhase := strings.TrimSpace(phase)
if trimmedPhase != "Running" {
return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil
}
return false, "", o.ensureVaultUnsealed(ctx)
}
// ensureVaultUnsealed runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@ -143,8 +143,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
return false, "", fmt.Errorf("decode flux kustomizations: %w", err) return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
} }
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations) ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
required := o.startupRequiredFluxKustomizations()
requiredSeen := map[string]struct{}{}
notReady := []string{} notReady := []string{}
for _, ks := range list.Items { for _, ks := range list.Items {
ns := strings.TrimSpace(ks.Metadata.Namespace) ns := strings.TrimSpace(ks.Metadata.Namespace)
@ -156,12 +154,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
if ks.Spec.Suspend { if ks.Spec.Suspend {
continue continue
} }
if len(required) > 0 {
if _, ok := required[full]; !ok {
continue
}
requiredSeen[full] = struct{}{}
}
if _, ok := ignored[full]; ok { if _, ok := ignored[full]; ok {
continue continue
} }
@ -181,25 +173,10 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
} }
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason)) notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
} }
if len(required) > 0 {
missing := []string{}
for full := range required {
if _, ok := requiredSeen[full]; !ok {
missing = append(missing, full+"(missing)")
}
}
if len(missing) > 0 {
sort.Strings(missing)
notReady = append(notReady, missing...)
}
}
if len(notReady) > 0 { if len(notReady) > 0 {
sort.Strings(notReady) sort.Strings(notReady)
return false, "not ready: " + joinLimited(notReady, 6), nil return false, "not ready: " + joinLimited(notReady, 6), nil
} }
if len(required) > 0 {
return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil
}
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
} }

View File

@ -19,7 +19,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
return nil
}
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
for node := range o.cfg.Startup.RequiredNodeLabels {
node = strings.TrimSpace(node)
@ -29,10 +28,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
}
sort.Strings(nodes)
for _, node := range nodes {
if _, skip := ignored[node]; skip {
o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
continue
}
labels := o.cfg.Startup.RequiredNodeLabels[node]
if len(labels) == 0 {
continue
@ -60,11 +55,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
continue
}
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
continue
}
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
}
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))

View File

@ -37,7 +37,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
return invErr
}
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
o.maybeRunEarlyVaultUnseal(ctx)
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
@ -180,9 +179,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
}
}
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
if err := o.runStartupVaultUnsealGate(ctx); err != nil {
return err
}
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
return err
}
@ -480,3 +476,18 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
o.log.Printf("shutdown flow complete")
return nil
}
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
switch strings.TrimSpace(raw) {
case "", "config", "cluster-only":
return "cluster-only", nil
case "poweroff":
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
default:
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
}
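
As a quick in-package sketch of the mapping this helper enforces:

// Sketch: how callers see the normalized modes.
for _, raw := range []string{"", "config", "cluster-only", "poweroff", "reboot"} {
	mode, err := normalizeShutdownMode(raw)
	fmt.Printf("%q -> mode=%q err=%v\n", raw, mode, err)
}
// "", "config", and "cluster-only" all normalize to "cluster-only";
// "poweroff" gets the removal error, anything else the unsupported error.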

View File

@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
seen := map[string]struct{}{}
for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) {
for _, node := range o.inventoryNodesForValidation() {
node = strings.TrimSpace(node)
if node == "" {
continue
View File

@ -1,261 +0,0 @@
package cluster
import (
"context"
"encoding/json"
"fmt"
"sort"
"strings"
"time"
)
// maybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
// Signature: (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: a non-core workload that cannot schedule can emit enough warning events to
// thrash the control plane datastore; quarantine keeps startup moving while
// preserving core services.
func (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
if o.runner.DryRun || !o.cfg.Startup.AutoQuarantineSchedulingStorms {
return
}
now := time.Now()
if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
return
}
if lastAttempt != nil {
*lastAttempt = now
}
o.bestEffort("quarantine non-core scheduling storm workloads", func() error {
return o.quarantineSchedulingStormWorkloads(ctx)
})
}
// quarantineSchedulingStormWorkloads runs one orchestration or CLI step.
// Signature: (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: limits startup-only mitigation to workloads proven to be generating a
// scheduling event storm, instead of scaling optional apps down blindly.
func (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error {
podsOut, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query pods for scheduling storm scan: %w", err)
}
var pods podList
if err := json.Unmarshal([]byte(podsOut), &pods); err != nil {
return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
}
rsOut, err := o.kubectl(ctx, 30*time.Second, "get", "replicasets", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query replicasets for scheduling storm scan: %w", err)
}
var rsList replicaSetList
if err := json.Unmarshal([]byte(rsOut), &rsList); err != nil {
return fmt.Errorf("decode replicasets for scheduling storm scan: %w", err)
}
eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query events for scheduling storm scan: %w", err)
}
var events eventList
if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
return fmt.Errorf("decode events for scheduling storm scan: %w", err)
}
workloadsOut, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query workloads for scheduling storm scan: %w", err)
}
var workloads workloadList
if err := json.Unmarshal([]byte(workloadsOut), &workloads); err != nil {
return fmt.Errorf("decode workloads for scheduling storm scan: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
eventThreshold := o.cfg.Startup.SchedulingStormEventThreshold
if eventThreshold <= 0 {
eventThreshold = 30
}
window := time.Duration(o.cfg.Startup.SchedulingStormWindowSeconds) * time.Second
if window <= 0 {
window = 3 * time.Minute
}
podsByKey := map[string]podResource{}
for _, pod := range pods.Items {
ns := strings.TrimSpace(pod.Metadata.Namespace)
name := strings.TrimSpace(pod.Metadata.Name)
if ns == "" || name == "" {
continue
}
podsByKey[ns+"/"+name] = pod
}
rsOwners := map[string]ownerReference{}
for _, rs := range rsList.Items {
ns := strings.TrimSpace(rs.Metadata.Namespace)
name := strings.TrimSpace(rs.Metadata.Name)
if ns == "" || name == "" {
continue
}
for _, owner := range rs.Metadata.OwnerReferences {
kind := strings.TrimSpace(owner.Kind)
ownerName := strings.TrimSpace(owner.Name)
if kind == "" || ownerName == "" {
continue
}
rsOwners[ns+"/"+name] = owner
break
}
}
workloadDesired := map[string]int32{}
for _, item := range workloads.Items {
kind := strings.ToLower(strings.TrimSpace(item.Kind))
ns := strings.TrimSpace(item.Metadata.Namespace)
name := strings.TrimSpace(item.Metadata.Name)
if kind == "" || ns == "" || name == "" {
continue
}
desired, _, ok := desiredReady(item)
if !ok {
continue
}
workloadDesired[ns+"/"+kind+"/"+name] = desired
}
quarantined := []string{}
seen := map[string]struct{}{}
now := time.Now()
for _, event := range events.Items {
if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
continue
}
if strings.TrimSpace(event.Reason) != "FailedScheduling" {
continue
}
if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
continue
}
lastSeen := eventLastObservedAt(event)
if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
continue
}
count := eventObservationCount(event)
if count < eventThreshold {
continue
}
podKey := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
pod, ok := podsByKey[podKey]
if !ok {
continue
}
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
continue
}
ns := strings.TrimSpace(pod.Metadata.Namespace)
if _, ok := requiredNamespaces[ns]; ok {
continue
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}
if workloadIgnored(ignoreRules, ns, "", strings.TrimSpace(pod.Metadata.Name)) {
continue
}
if podTargetsIgnoredNode(pod, ignoredNodes) {
continue
}
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
if !ok {
continue
}
if workloadIgnored(ignoreRules, workload.Namespace, workload.Kind, workload.Name) {
continue
}
workloadKey := workload.Namespace + "/" + workload.Kind + "/" + workload.Name
if _, done := seen[workloadKey]; done {
continue
}
desired := workloadDesired[workloadKey]
if desired <= 0 {
continue
}
if err := o.ensureWorkloadReplicas(ctx, workload, 0); err != nil {
return fmt.Errorf("scale scheduling storm workload %s to 0: %w", workloadKey, err)
}
seen[workloadKey] = struct{}{}
quarantined = append(quarantined, fmt.Sprintf("%s events=%d window=%s", workloadKey, count, window))
}
if len(quarantined) == 0 {
return nil
}
sort.Strings(quarantined)
detail := fmt.Sprintf("quarantined scheduling storm workload(s): %s", joinLimited(quarantined, 8))
o.log.Printf("%s", detail)
o.noteStartupAutoHeal(detail)
return nil
}
// schedulingStormOwnerWorkload runs one orchestration or CLI step.
// Signature: schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool).
// Why: scheduling storms happen at the pod layer, but safe mitigation needs to
// operate on the owning deployment or statefulset.
func schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool) {
ns := strings.TrimSpace(pod.Metadata.Namespace)
for _, owner := range pod.Metadata.OwnerReferences {
switch strings.TrimSpace(owner.Kind) {
case "StatefulSet":
if name := strings.TrimSpace(owner.Name); name != "" {
return startupWorkload{Namespace: ns, Kind: "statefulset", Name: name}, true
}
case "ReplicaSet":
rsName := strings.TrimSpace(owner.Name)
if rsName == "" {
continue
}
rsOwner, ok := rsOwners[ns+"/"+rsName]
if !ok || strings.TrimSpace(rsOwner.Kind) != "Deployment" || strings.TrimSpace(rsOwner.Name) == "" {
continue
}
return startupWorkload{Namespace: ns, Kind: "deployment", Name: strings.TrimSpace(rsOwner.Name)}, true
}
}
return startupWorkload{}, false
}
// eventObservationCount runs one orchestration or CLI step.
// Signature: eventObservationCount(event eventResource) int.
// Why: event count can live either on the root event or in the series payload;
// using the max keeps detection stable across Kubernetes versions.
func eventObservationCount(event eventResource) int {
count := event.Count
if event.Series.Count > count {
count = event.Series.Count
}
if count < 1 {
return 1
}
return count
}
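
Worked through (an in-package sketch):

// Sketch of the max-with-floor rule.
e := eventResource{Count: 4}
e.Series.Count = 41
_ = eventObservationCount(e)                       // 41: the series count wins when larger
_ = eventObservationCount(eventResource{Count: 4}) // 4: the root count stands on its own
_ = eventObservationCount(eventResource{})         // 1: floor for events carrying no counts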
// eventLastObservedAt runs one orchestration or CLI step.
// Signature: eventLastObservedAt(event eventResource) time.Time.
// Why: event recency fields vary by cluster version; prefer the newest explicit
// observation time and fall back to creation time when needed.
func eventLastObservedAt(event eventResource) time.Time {
switch {
case !event.Series.LastObservedTime.IsZero():
return event.Series.LastObservedTime
case !event.LastTimestamp.IsZero():
return event.LastTimestamp
case !event.EventTime.IsZero():
return event.EventTime
default:
return event.Metadata.CreationTimestamp
}
}
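
Wiring the feature up end to end, a minimal sketch using the Startup fields this file reads (declared in the config package later in this diff; the values shown are the fallbacks applied above, not tuning advice):

// Sketch: the three knobs the scheduling-storm path consumes.
cfg := config.Config{}
cfg.Startup.AutoQuarantineSchedulingStorms = true // off by default; nothing runs without it
cfg.Startup.SchedulingStormEventThreshold = 30    // FailedScheduling observations per pod
cfg.Startup.SchedulingStormWindowSeconds = 180    // only events this recent are considered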

View File

@ -1,21 +0,0 @@
package cluster
import (
"fmt"
"strings"
)
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
switch strings.TrimSpace(raw) {
case "", "config", "cluster-only":
return "cluster-only", nil
case "poweroff":
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
default:
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
}

View File

@ -1,81 +0,0 @@
package cluster
import "strings"
// startupRequiredNodes runs one orchestration or CLI step.
// Signature: startupRequiredNodes(nodes []string, required []string) []string.
// Why: lets startup enforce a smaller core node set during outage recovery
// without losing the stricter all-nodes behavior when no override is configured.
func startupRequiredNodes(nodes []string, required []string) []string {
requiredSet := makeStringSet(required)
if len(requiredSet) == 0 {
return nodes
}
filtered := make([]string, 0, len(nodes))
for _, node := range nodes {
node = strings.TrimSpace(node)
if node == "" {
continue
}
if _, ok := requiredSet[node]; ok {
filtered = append(filtered, node)
}
}
return filtered
}
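
Behavior in miniature, via the exported test hook that appears later in this diff:

all := []string{"titan-0a", "titan-06", "titan-13"}
_ = TestHookStartupRequiredNodes(all, nil)                  // no override: all three pass through
_ = TestHookStartupRequiredNodes(all, []string{"titan-0a"}) // core-only recovery: just titan-0a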
// startupNodeStrictlyRequired runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool.
// Why: absent or broken non-core nodes should not block recovery-only actions
// like label reconciliation once the operator has narrowed startup to core nodes.
func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool {
node = strings.TrimSpace(node)
if node == "" {
return false
}
if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 {
return true
}
for _, controlPlane := range o.cfg.ControlPlanes {
if strings.TrimSpace(controlPlane) == node {
return true
}
}
if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) {
return true
}
return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node)
}
// startupRequiredFluxKustomizations runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}.
// Why: lets outage recovery wait on a declared core GitOps slice while leaving
// optional stacks free to converge after bootstrap succeeds.
func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} {
return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations)
}
// startupRequiredWorkloadNamespaces runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: keeps workload readiness scoped to core namespaces during recovery while
// preserving broad convergence checks when no explicit core list is configured.
func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} {
return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces)
}
// containsNode runs one orchestration or CLI step.
// Signature: containsNode(entries []string, needle string) bool.
// Why: keeps node-scope checks small and explicit anywhere startup narrows its
// recovery gates to a declared core set.
func containsNode(entries []string, needle string) bool {
needle = strings.TrimSpace(needle)
if needle == "" {
return false
}
for _, entry := range entries {
if strings.TrimSpace(entry) == needle {
return true
}
}
return false
}

View File

@ -1,52 +0,0 @@
package cluster
import (
"context"
"fmt"
"time"
)
// maybeRunEarlyVaultUnseal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context).
// Why: gives startup a best-effort Vault recovery path when the API is already
// live, without consuming the hard startup failure path before workloads recover.
func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) {
if err := o.waitForAPI(ctx, 1, time.Second); err != nil {
return
}
o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available")
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
if err != nil {
o.log.Printf("warning: early vault unseal deferred: %v", err)
o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err))
return
}
if deferred {
o.log.Printf("vault early unseal deferred: %s", detail)
o.noteStartupAutoHeal(detail)
return
}
o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed")
}
// runStartupVaultUnsealGate runs one orchestration or CLI step.
// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error.
// Why: keeps the top-level startup flow readable while allowing Vault unseal to
// defer cleanly until critical workload recovery when the pod is not runnable yet.
func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error {
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
if err != nil {
o.noteStartupCheck("vault-unseal", false, err.Error())
return err
}
if deferred {
o.log.Printf("vault unseal deferred until workload recovery: %s", detail)
o.noteStartupAutoHeal(detail)
o.noteStartupCheck("vault-unseal", true, detail)
return nil
}
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
return nil
}

View File

@ -177,46 +177,6 @@ type jobConditionRef struct {
Status string `json:"status"`
}
type eventList struct {
Items []eventResource `json:"items"`
}
type eventResource struct {
Metadata struct {
Namespace string `json:"namespace"`
CreationTimestamp time.Time `json:"creationTimestamp"`
} `json:"metadata"`
InvolvedObject struct {
Kind string `json:"kind"`
Namespace string `json:"namespace"`
Name string `json:"name"`
} `json:"involvedObject"`
Type string `json:"type"`
Reason string `json:"reason"`
Message string `json:"message"`
Count int `json:"count"`
EventTime time.Time `json:"eventTime"`
LastTimestamp time.Time `json:"lastTimestamp"`
Series eventSeries `json:"series"`
}
type eventSeries struct {
Count int `json:"count"`
LastObservedTime time.Time `json:"lastObservedTime"`
}
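
For orientation, a representative FailedScheduling event (illustrative, not captured output) that unmarshals into the structs above:

// Illustrative payload; field names follow the JSON tags above.
const sampleEvent = `{
  "metadata": {"namespace": "media", "creationTimestamp": "2026-01-05T10:00:00Z"},
  "involvedObject": {"kind": "Pod", "namespace": "media", "name": "app-6f7c9-x2x"},
  "type": "Warning",
  "reason": "FailedScheduling",
  "message": "0/12 nodes are available",
  "count": 3,
  "series": {"count": 41, "lastObservedTime": "2026-01-05T10:02:30Z"}
}`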
type replicaSetList struct {
Items []replicaSetResource `json:"items"`
}
type replicaSetResource struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
OwnerReferences []ownerReference `json:"ownerReferences"`
} `json:"metadata"`
}
type workloadResource struct {
Kind string `json:"kind"`
Metadata struct {
@ -261,7 +221,6 @@ type podResource struct {
type ownerReference struct {
Kind string `json:"kind"`
Name string `json:"name"`
}
type podContainerStatus struct {

View File

@ -26,12 +26,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error {
lastLogged := time.Time{}
lastRecycleAttempt := time.Time{}
lastReplicaHeal := time.Time{}
lastSchedulingStormHeal := time.Time{}
for {
prevFailure := lastFailure
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
o.maybeAutoQuarantineSchedulingStorms(ctx, &lastSchedulingStormHeal)
ready, detail, err := o.workloadConvergenceReady(ctx)
if err != nil {
lastFailure = err.Error()
@ -73,7 +71,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if err := json.Unmarshal([]byte(out), &list); err != nil {
return false, "", fmt.Errorf("decode controllers: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
@ -87,11 +84,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if kind == "" || ns == "" || name == "" {
continue
}
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}

View File

@ -116,7 +116,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
return nil, fmt.Errorf("decode pods: %w", err) return nil, fmt.Errorf("decode pods: %w", err)
} }
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
stuckReasons := map[string]struct{}{ stuckReasons := map[string]struct{}{
@ -139,11 +138,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
if ns == "" || name == "" { if ns == "" || name == "" {
continue continue
} }
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok { if _, ok := ignoredNamespaces[ns]; ok {
continue continue
} }

View File

@ -1,88 +0,0 @@
package cluster
import (
"context"
"fmt"
"strings"
"time"
)
// TestHookMaybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: exposes the scheduling-storm trigger guard to the split top-level test module.
func (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
o.maybeAutoQuarantineSchedulingStorms(ctx, lastAttempt)
}
// TestHookQuarantineSchedulingStormWorkloads runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: exposes the scheduling-storm auto-heal body to the split top-level test module.
func (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error {
return o.quarantineSchedulingStormWorkloads(ctx)
}
// TestHookSchedulingStormOwnerWorkload runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormOwnerWorkload(namespace string, ownerKind string, ownerName string, rsOwnerKind string, rsOwnerName string) (string, bool).
// Why: exposes owner-resolution behavior without leaking internal workload types.
func TestHookSchedulingStormOwnerWorkload(
namespace string,
ownerKind string,
ownerName string,
rsOwnerKind string,
rsOwnerName string,
) (string, bool) {
var pod podResource
pod.Metadata.Namespace = strings.TrimSpace(namespace)
pod.Metadata.OwnerReferences = []ownerReference{{
Kind: strings.TrimSpace(ownerKind),
Name: strings.TrimSpace(ownerName),
}}
rsOwners := map[string]ownerReference{}
if rsName := strings.TrimSpace(ownerName); rsName != "" && strings.TrimSpace(ownerKind) == "ReplicaSet" {
rsOwners[pod.Metadata.Namespace+"/"+rsName] = ownerReference{
Kind: strings.TrimSpace(rsOwnerKind),
Name: strings.TrimSpace(rsOwnerName),
}
}
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
if !ok {
return "", false
}
return fmt.Sprintf("%s/%s/%s", workload.Namespace, workload.Kind, workload.Name), true
}
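
For example, the two owner shapes the hook can resolve:

key, ok := TestHookSchedulingStormOwnerWorkload("media", "ReplicaSet", "app-6f7c9", "Deployment", "app")
// ok == true, key == "media/deployment/app": pod -> ReplicaSet -> Deployment
key2, ok2 := TestHookSchedulingStormOwnerWorkload("media", "StatefulSet", "db", "", "")
// ok2 == true, key2 == "media/statefulset/db": StatefulSets resolve directly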
// TestHookEventObservationCount runs one orchestration or CLI step.
// Signature: TestHookEventObservationCount(count int, seriesCount int) int.
// Why: exposes event-count normalization used by scheduling-storm detection.
func TestHookEventObservationCount(count int, seriesCount int) int {
return eventObservationCount(eventResource{
Count: count,
Series: eventSeries{
Count: seriesCount,
},
})
}
// TestHookEventLastObservedAt runs one orchestration or CLI step.
// Signature: TestHookEventLastObservedAt(seriesLastObserved time.Time, lastTimestamp time.Time, eventTime time.Time, creationTimestamp time.Time) time.Time.
// Why: exposes event-time fallback behavior used by scheduling-storm detection.
func TestHookEventLastObservedAt(
seriesLastObserved time.Time,
lastTimestamp time.Time,
eventTime time.Time,
creationTimestamp time.Time,
) time.Time {
return eventLastObservedAt(eventResource{
LastTimestamp: lastTimestamp,
EventTime: eventTime,
Series: eventSeries{
LastObservedTime: seriesLastObserved,
},
Metadata: struct {
Namespace string `json:"namespace"`
CreationTimestamp time.Time `json:"creationTimestamp"`
}{
CreationTimestamp: creationTimestamp,
},
})
}

View File

@ -1,55 +0,0 @@
package cluster
import "context"
// TestHookStartupRequiredNodes runs one orchestration or CLI step.
// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string.
// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing.
func TestHookStartupRequiredNodes(nodes []string, required []string) []string {
return startupRequiredNodes(nodes, required)
}
// TestHookContainsNode runs one orchestration or CLI step.
// Signature: TestHookContainsNode(entries []string, needle string) bool.
// Why: exposes the small startup-scope membership helper to top-level tests.
func TestHookContainsNode(entries []string, needle string) bool {
return containsNode(entries, needle)
}
// TestHookStartupNodeStrictlyRequired runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool.
// Why: exposes strict-node startup scoping so outage-recovery tests can confirm
// non-core nodes stop blocking bootstrap.
func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool {
return o.startupNodeStrictlyRequired(node)
}
// TestHookStartupRequiredFluxKustomizations runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}.
// Why: exposes flux startup scoping so top-level tests can confirm only core
// kustomizations block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} {
return o.startupRequiredFluxKustomizations()
}
// TestHookStartupRequiredWorkloadNamespaces runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: exposes workload namespace startup scoping so top-level tests can
// confirm only core workloads block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} {
return o.startupRequiredWorkloadNamespaces()
}
// TestHookMaybeRunEarlyVaultUnseal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context).
// Why: exposes the early startup Vault deferral helper to top-level tests.
func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) {
o.maybeRunEarlyVaultUnseal(ctx)
}
// TestHookRunStartupVaultUnsealGate runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error.
// Why: exposes the startup Vault gate helper to top-level tests.
func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error {
return o.runStartupVaultUnsealGate(ctx)
}

View File

@ -33,9 +33,6 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
c.Startup.NodeInventoryReachPollSeconds = 5
}
if c.Startup.NodeInventoryReachRequiredNodes == nil {
c.Startup.NodeInventoryReachRequiredNodes = []string{}
}
if c.Startup.RequiredNodeLabels == nil {
c.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
@ -124,11 +121,7 @@ func (c *Config) applyDefaults() {
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
}
if c.Startup.ServiceChecklistExplicitOnly {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{})
} else {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
}
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
for i := range c.Startup.ServiceChecklist {
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
@ -159,18 +152,12 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
c.Startup.NodeSSHAuthPollSeconds = 5
}
if c.Startup.NodeSSHAuthRequiredNodes == nil {
c.Startup.NodeSSHAuthRequiredNodes = []string{}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
c.Startup.FluxHealthWaitSeconds = 900
}
if c.Startup.FluxHealthPollSeconds <= 0 {
c.Startup.FluxHealthPollSeconds = 5
}
if c.Startup.FluxHealthRequiredKustomizations == nil {
c.Startup.FluxHealthRequiredKustomizations = []string{}
}
if c.Startup.IgnoreFluxKustomizations == nil {
c.Startup.IgnoreFluxKustomizations = []string{}
}
@ -180,9 +167,6 @@ func (c *Config) applyDefaults() {
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
c.Startup.WorkloadConvergencePollSeconds = 5
}
if c.Startup.WorkloadConvergenceRequiredNamespaces == nil {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{}
}
if c.Startup.IgnoreWorkloadNamespaces == nil {
c.Startup.IgnoreWorkloadNamespaces = []string{}
}
@ -195,12 +179,6 @@ func (c *Config) applyDefaults() {
if c.Startup.StuckPodGraceSeconds <= 0 {
c.Startup.StuckPodGraceSeconds = 180
}
if c.Startup.PostStartAutoHealSeconds <= 0 {
c.Startup.PostStartAutoHealSeconds = 60
}
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
c.Startup.DeadNodeCleanupGraceSeconds = 300
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
}
@ -243,12 +221,6 @@ func (c *Config) applyDefaults() {
if c.UPS.TelemetryTimeoutSeconds <= 0 {
c.UPS.TelemetryTimeoutSeconds = 90
}
if c.Startup.SchedulingStormEventThreshold <= 0 {
c.Startup.SchedulingStormEventThreshold = 30
}
if c.Startup.SchedulingStormWindowSeconds <= 0 {
c.Startup.SchedulingStormWindowSeconds = 180
}
if c.Coordination.ForwardShutdownConfig == "" {
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
}

View File

@ -39,25 +39,24 @@ func defaults() Config {
"maintenance", "maintenance",
}, },
Startup: Startup{ Startup: Startup{
APIWaitSeconds: 1200, APIWaitSeconds: 1200,
APIPollSeconds: 2, APIPollSeconds: 2,
ShutdownCooldownSeconds: 45, ShutdownCooldownSeconds: 45,
RequireNodeInventoryReach: true, RequireNodeInventoryReach: true,
NodeInventoryReachWaitSeconds: 300, NodeInventoryReachWaitSeconds: 300,
NodeInventoryReachPollSeconds: 5, NodeInventoryReachPollSeconds: 5,
NodeInventoryReachRequiredNodes: []string{}, RequireTimeSync: true,
RequireTimeSync: true, TimeSyncWaitSeconds: 240,
TimeSyncWaitSeconds: 240, TimeSyncPollSeconds: 5,
TimeSyncPollSeconds: 5, TimeSyncMode: "quorum",
TimeSyncMode: "quorum", TimeSyncQuorum: 2,
TimeSyncQuorum: 2, ReconcileAccessOnBoot: true,
ReconcileAccessOnBoot: true, AutoEtcdRestoreOnAPIFailure: true,
AutoEtcdRestoreOnAPIFailure: true, EtcdRestoreControlPlane: "titan-0a",
EtcdRestoreControlPlane: "titan-0a", RequireStorageReady: true,
RequireStorageReady: true, StorageReadyWaitSeconds: 420,
StorageReadyWaitSeconds: 420, StorageReadyPollSeconds: 5,
StorageReadyPollSeconds: 5, StorageMinReadyNodes: 2,
StorageMinReadyNodes: 2,
StorageCriticalPVCs: []string{ StorageCriticalPVCs: []string{
"vault/data-vault-0", "vault/data-vault-0",
"postgres/postgres-data-postgres-0", "postgres/postgres-data-postgres-0",
@ -92,36 +91,33 @@ func defaults() Config {
AdminSecretUsernameKey: "username",
AdminSecretPasswordKey: "password",
},
ServiceChecklist: defaultServiceChecklist(),
RequireCriticalServiceEndpoints: true,
CriticalServiceEndpointWaitSec: 420,
CriticalServiceEndpointPollSec: 5,
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
RequireIngressChecklist: true,
IngressChecklistWaitSeconds: 420,
IngressChecklistPollSeconds: 5,
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
IngressChecklistIgnoreHosts: []string{},
RequireNodeSSHAuth: true,
NodeSSHAuthWaitSeconds: 240,
NodeSSHAuthPollSeconds: 5,
NodeSSHAuthRequiredNodes: []string{},
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
FluxHealthRequiredKustomizations: []string{},
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
WorkloadConvergenceRequiredNamespaces: []string{},
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
},
Shutdown: Shutdown{
DefaultBudgetSeconds: 1380,

View File

@ -51,41 +51,3 @@ startup:
t.Fatalf("expected validation failure") t.Fatalf("expected validation failure")
} }
} }
// TestLoadKeepsExplicitServiceChecklist runs one orchestration or CLI step.
// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T).
// Why: host recovery configs must be able to keep a narrow, explicit checklist
// without silently inheriting the full default service catalog.
func TestLoadKeepsExplicitServiceChecklist(t *testing.T) {
cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
raw := `
control_planes: [titan-0a]
expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
iac_repo_path: /opt/titan-iac
startup:
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
ups:
enabled: false
`
if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil {
t.Fatalf("write config: %v", err)
}
cfg, err := Load(cfgPath)
if err != nil {
t.Fatalf("load config: %v", err)
}
if len(cfg.Startup.ServiceChecklist) != 1 {
t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist))
}
if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" {
t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name)
}
}

View File

@ -27,75 +27,65 @@ type Config struct {
}
type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
TimeSyncMode string `yaml:"time_sync_mode"`
TimeSyncQuorum int `yaml:"time_sync_quorum"`
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
RequireStorageReady bool `yaml:"require_storage_ready"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"`
RequireServiceChecklist bool `yaml:"require_service_checklist"`
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"`
RequireFluxHealth bool `yaml:"require_flux_health"`
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"`
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"`
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
IgnoreWorkloads []string `yaml:"ignore_workloads"`
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
AutoQuarantineSchedulingStorms bool `yaml:"auto_quarantine_scheduling_storms"`
SchedulingStormEventThreshold int `yaml:"scheduling_storm_event_threshold"`
SchedulingStormWindowSeconds int `yaml:"scheduling_storm_window_seconds"`
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
PostStartAutoHealSeconds int `yaml:"post_start_auto_heal_seconds"`
DeadNodeCleanupGraceSeconds int `yaml:"dead_node_cleanup_grace_seconds"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
type ServiceChecklistCheck struct {
@ -146,7 +136,6 @@ type UPS struct {
Targets []UPSTarget `yaml:"targets"`
PollSeconds int `yaml:"poll_seconds"`
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"`
DebounceCount int `yaml:"debounce_count"`
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
}

View File

@ -61,11 +61,6 @@ func (c Config) Validate() error {
if c.Startup.NodeInventoryReachPollSeconds <= 0 { if c.Startup.NodeInventoryReachPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0") return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
} }
for _, node := range c.Startup.NodeInventoryReachRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
}
}
for node, labels := range c.Startup.RequiredNodeLabels { for node, labels := range c.Startup.RequiredNodeLabels {
if strings.TrimSpace(node) == "" { if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.required_node_labels keys must not be empty") return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
@ -238,46 +233,21 @@ func (c Config) Validate() error {
if c.Startup.NodeSSHAuthPollSeconds <= 0 { if c.Startup.NodeSSHAuthPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0") return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
} }
for _, node := range c.Startup.NodeSSHAuthRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty")
}
}
if c.Startup.FluxHealthWaitSeconds <= 0 { if c.Startup.FluxHealthWaitSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0") return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
} }
if c.Startup.FluxHealthPollSeconds <= 0 { if c.Startup.FluxHealthPollSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0") return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
} }
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
item = strings.TrimSpace(item)
if item == "" {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty")
}
if strings.Count(item, "/") != 1 {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item)
}
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 { if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0") return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
} }
if c.Startup.WorkloadConvergencePollSeconds <= 0 { if c.Startup.WorkloadConvergencePollSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0") return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
} }
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if strings.TrimSpace(ns) == "" {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty")
}
}
if c.Startup.StuckPodGraceSeconds <= 0 { if c.Startup.StuckPodGraceSeconds <= 0 {
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0") return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
} }
if c.Startup.PostStartAutoHealSeconds <= 0 {
return fmt.Errorf("config.startup.post_start_auto_heal_seconds must be > 0")
}
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
return fmt.Errorf("config.startup.dead_node_cleanup_grace_seconds must be > 0")
}
for _, probe := range c.Startup.PostStartProbes { for _, probe := range c.Startup.PostStartProbes {
if strings.TrimSpace(probe) == "" { if strings.TrimSpace(probe) == "" {
return fmt.Errorf("config.startup.post_start_probes entries must not be empty") return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
@ -307,16 +277,6 @@ func (c Config) Validate() error {
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty") return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
} }
} }
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) {
return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item))
}
}
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns))
}
}
for _, node := range c.Startup.IgnoreUnavailableNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
@ -332,9 +292,6 @@ func (c Config) Validate() error {
if c.UPS.Provider == "" { if c.UPS.Provider == "" {
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled") return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
} }
if c.UPS.OnBatteryGraceSeconds < 0 {
return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0")
}
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 { if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled") return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
} }
@ -349,14 +306,6 @@ func (c Config) Validate() error {
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set") return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
} }
} }
if c.Startup.AutoQuarantineSchedulingStorms {
if c.Startup.SchedulingStormEventThreshold <= 0 {
return fmt.Errorf("config.startup.scheduling_storm_event_threshold must be > 0 when auto_quarantine_scheduling_storms is enabled")
}
if c.Startup.SchedulingStormWindowSeconds <= 0 {
return fmt.Errorf("config.startup.scheduling_storm_window_seconds must be > 0 when auto_quarantine_scheduling_storms is enabled")
}
}
for _, peer := range c.Coordination.PeerHosts {
if strings.TrimSpace(peer) == "" {
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
@ -379,20 +328,3 @@ func (c Config) Validate() error {
}
return nil
}
// containsTrimmed reports whether entries contains needle after whitespace trimming.
// Signature: containsTrimmed(entries []string, needle string) bool.
// Why: startup config now supports both required and ignored recovery scopes, so
// validation needs a single normalized overlap check for those lists.
func containsTrimmed(entries []string, needle string) bool {
needle = strings.TrimSpace(needle)
if needle == "" {
return false
}
for _, entry := range entries {
if strings.TrimSpace(entry) == needle {
return true
}
}
return false
}
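
To make the removed guard concrete, here is a minimal, self-contained sketch of how containsTrimmed backs the required/ignored overlap checks above; the scope values are hypothetical, not taken from any real cluster config.

package main

import (
	"fmt"
	"strings"
)

// containsTrimmed mirrors the helper above: membership after trimming whitespace.
func containsTrimmed(entries []string, needle string) bool {
	needle = strings.TrimSpace(needle)
	if needle == "" {
		return false
	}
	for _, entry := range entries {
		if strings.TrimSpace(entry) == needle {
			return true
		}
	}
	return false
}

func main() {
	// Hypothetical scopes: one kustomization is both required and ignored.
	required := []string{"flux-system/core", "monitoring/stack"}
	ignored := []string{" flux-system/core "} // sloppy whitespace still matches
	for _, item := range required {
		if containsTrimmed(ignored, item) {
			fmt.Printf("overlap: %q is both required and ignored\n", strings.TrimSpace(item))
		}
	}
}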

View File

@ -30,7 +30,6 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }}, {"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }}, {"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }}, {"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }}, {"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }}, {"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }}, {"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
@ -69,42 +68,19 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }}, {"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }}, {"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }}, {"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
{"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }},
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }}, {"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }}, {"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
{"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }},
{"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }},
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }}, {"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }}, {"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }}, {"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
{"bad_post_start_auto_heal_seconds", func(c *Config) { c.Startup.PostStartAutoHealSeconds = 0 }},
{"bad_dead_node_cleanup_grace_seconds", func(c *Config) { c.Startup.DeadNodeCleanupGraceSeconds = 0 }},
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }}, {"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }}, {"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }}, {"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }}, {"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
{"bad_overlap_flux_required_and_ignored", func(c *Config) {
c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"}
c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"}
}},
{"bad_overlap_workload_required_and_ignored", func(c *Config) {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"}
}},
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }}, {"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }}, {"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
{"bad_scheduling_storm_threshold", func(c *Config) {
c.Startup.AutoQuarantineSchedulingStorms = true
c.Startup.SchedulingStormEventThreshold = 0
}},
{"bad_scheduling_storm_window", func(c *Config) {
c.Startup.AutoQuarantineSchedulingStorms = true
c.Startup.SchedulingStormWindowSeconds = 0
}},
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }}, {"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }}, {"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
{"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }},
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }}, {"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
{"bad_ups_targets_item_empty", func(c *Config) { {"bad_ups_targets_item_empty", func(c *Config) {
c.UPS.Enabled = true c.UPS.Enabled = true
@ -145,13 +121,6 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup defaults to be set")
}
if cfg.Startup.PostStartAutoHealSeconds <= 0 || cfg.Startup.DeadNodeCleanupGraceSeconds <= 0 {
t.Fatalf("expected post-start auto-heal defaults to be set")
}
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
t.Fatalf("expected startup recovery scope slices to be initialized")
}
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
t.Fatalf("expected critical service endpoint timing defaults to be set")
}

View File

@ -32,8 +32,6 @@ type Daemon struct {
targets []Target
log *log.Logger
exporter *metrics.Exporter
postStartAutoHealOverride func(context.Context) error
}
var sshConfigCandidates = []string{
@ -94,9 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error {
lastGood := map[string]time.Time{}
lastOnBattery := map[string]bool{}
onBatterySince := map[string]time.Time{}
breachCount := map[string]int{}
lastAutoHeal := time.Time{}
for _, t := range d.targets {
lastGood[t.Name] = time.Now()
}
@ -111,16 +107,12 @@ func (d *Daemon) Run(ctx context.Context) error {
case <-t.C:
budget := d.orch.EstimatedEmergencyShutdownSeconds()
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
anyOnBattery := false
d.exporter.UpdateBudget(budget)
for _, target := range d.targets {
sample, err := target.Provider.Read(ctx)
if err != nil {
if lastOnBattery[target.Name] {
anyOnBattery = true
}
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err) d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
d.exporter.UpdateSample(metrics.Sample{ d.exporter.UpdateSample(metrics.Sample{
Name: target.Name, Name: target.Name,
@ -139,45 +131,17 @@ func (d *Daemon) Run(ctx context.Context) error {
}
lastGood[target.Name] = time.Now()
if sample.OnBattery {
anyOnBattery = true
}
wasOnBattery := lastOnBattery[target.Name]
if sample.OnBattery {
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
onBatterySince[target.Name] = time.Now()
}
} else {
onBatterySince[target.Name] = time.Time{}
}
lastOnBattery[target.Name] = sample.OnBattery
onBatteryElapsed := 0
if sample.OnBattery && !onBatterySince[target.Name].IsZero() {
onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds())
}
trigger := false
triggerReason := ""
switch {
case sample.LowBattery:
trigger = true
triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus)
case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold:
trigger = true
triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds:
trigger = true
triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus)
}
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
if trigger {
breachCount[target.Name]++
} else {
breachCount[target.Name] = 0
}
d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d", d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name]) target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
d.exporter.UpdateSample(metrics.Sample{
Name: target.Name,
@ -196,54 +160,14 @@ func (d *Daemon) Run(ctx context.Context) error {
})
if breachCount[target.Name] >= debounce {
return d.triggerShutdown(ctx, triggerReason)
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
return d.triggerShutdown(ctx, reason)
}
}
d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
}
}
}
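
For intuition, a runnable sketch of the trigger arithmetic the loop applies each tick: threshold = ceil(budget * runtime_safety_factor), and the retained code path fires on low battery or when reported runtime falls inside that threshold. The numbers below are hypothetical.

package main

import (
	"fmt"
	"math"
)

// sample mirrors the UPS fields the loop consults; values are made up.
type sample struct {
	OnBattery      bool
	LowBattery     bool
	RuntimeSeconds int
}

func main() {
	budget := 1380       // hypothetical estimated emergency-shutdown seconds
	safetyFactor := 1.25 // hypothetical ups.runtime_safety_factor
	threshold := int(math.Ceil(float64(budget) * safetyFactor))

	s := sample{OnBattery: true, RuntimeSeconds: 1500}
	trigger := s.LowBattery || (s.OnBattery && s.RuntimeSeconds > 0 && s.RuntimeSeconds <= threshold)
	fmt.Printf("threshold=%ds runtime=%ds trigger=%t\n", threshold, s.RuntimeSeconds, trigger)
	// With these numbers: threshold=1725s, runtime=1500s, so trigger=true.
}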
// maybeRunPostStartAutoHeal gates the periodic post-start repair pass.
// Signature: (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool).
// Why: gives the long-running daemon a bounded path to repair post-start drift
// like a later Vault reseal or stale dead-node deletions without waiting for a
// fresh bootstrap run.
func (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool) {
interval := time.Duration(d.cfg.Startup.PostStartAutoHealSeconds) * time.Second
if interval <= 0 || anyOnBattery {
return
}
if d.orch == nil && d.postStartAutoHealOverride == nil {
return
}
now := time.Now()
if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
return
}
if lastRun != nil {
*lastRun = now
}
if err := d.runPostStartAutoHeal(ctx); err != nil {
d.log.Printf("warning: post-start auto-heal: %v", err)
}
}
// runPostStartAutoHeal executes a single post-start repair pass.
// Signature: (d *Daemon) runPostStartAutoHeal(ctx context.Context) error.
// Why: keeps the daemon loop readable while allowing unit tests to inject a
// deterministic repair hook without a live cluster.
func (d *Daemon) runPostStartAutoHeal(ctx context.Context) error {
if d.postStartAutoHealOverride != nil {
return d.postStartAutoHealOverride(ctx)
}
if d.orch == nil {
return nil
}
return d.orch.RunPostStartAutoHeal(ctx)
}
// triggerShutdown initiates the coordinated shutdown sequence for the given reason.
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@ -165,50 +165,6 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
}
}
// TestDaemonRunTriggersShutdownAfterOnBatteryGrace exercises the sustained-on-battery shutdown path.
// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T).
// Why: covers the sustained-on-battery trigger so short runtime estimates are not
// the only path to a graceful shutdown during abrupt power loss.
func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) {
stateDir := t.TempDir()
orch := newDaemonTestOrchestrator(t, stateDir)
d := &Daemon{
cfg: config.Config{
UPS: config.UPS{
Enabled: true,
PollSeconds: 1,
DebounceCount: 1,
RuntimeSafetyFactor: 1.0,
OnBatteryGraceSeconds: 1,
},
State: config.State{
IntentPath: filepath.Join(stateDir, "intent.json"),
},
Shutdown: config.Shutdown{
EmergencySkipDrain: true,
EmergencySkipEtcd: true,
},
},
orch: orch,
targets: []Target{
{
Name: "Pyrphoros",
Target: "pyrphoros@localhost",
Provider: &daemonFakeProvider{
samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
},
},
},
log: log.New(io.Discard, "", 0),
exporter: metrics.New(),
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := d.Run(ctx); err != nil {
t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err)
}
}
// TestForwardShutdownSucceedsWithSSHShim verifies forward shutdown succeeds through the SSH shim.
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
// Why: covers forward-shutdown SSH execution path.

View File

@ -1,51 +0,0 @@
package service
import (
"context"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
)
// TestDaemonMaybeRunPostStartAutoHeal verifies the auto-heal scheduling guards.
// Signature: TestDaemonMaybeRunPostStartAutoHeal(t *testing.T).
// Why: covers the daemon-side interval and on-battery guards for the new
// post-start repair loop.
func TestDaemonMaybeRunPostStartAutoHeal(t *testing.T) {
calls := 0
d := &Daemon{
cfg: config.Config{
Startup: config.Startup{
PostStartAutoHealSeconds: 10,
},
},
postStartAutoHealOverride: func(context.Context) error {
calls++
return nil
},
}
var last time.Time
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 1 {
t.Fatalf("expected first auto-heal invocation, got %d", calls)
}
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 1 {
t.Fatalf("expected interval guard to suppress second call, got %d", calls)
}
last = time.Now().Add(-11 * time.Second)
d.maybeRunPostStartAutoHeal(context.Background(), &last, true)
if calls != 1 {
t.Fatalf("expected on-battery guard to suppress call, got %d", calls)
}
last = time.Now().Add(-11 * time.Second)
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 2 {
t.Fatalf("expected second allowed auto-heal call, got %d", calls)
}
}

View File

@ -22,23 +22,12 @@ type Intent struct {
UpdatedAt time.Time `json:"updated_at"`
}
var (
readIntentImpl = readIntentDefault
writeIntentImpl = writeIntentDefault
)
var writeIntentImpl = writeIntentDefault
// ReadIntent loads the persisted intent state from path.
// Signature: ReadIntent(path string) (Intent, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func ReadIntent(path string) (Intent, error) {
return readIntentImpl(path)
}
// readIntentDefault is the production implementation behind ReadIntent.
// Signature: readIntentDefault(path string) (Intent, error).
// Why: keeps production read behavior available while tests can override intent
// reads deterministically without racing background file mutations.
func readIntentDefault(path string) (Intent, error) {
b, err := os.ReadFile(path)
if err != nil {
if os.IsNotExist(err) {

View File

@ -22,34 +22,6 @@ func TestHookWriteIntentDefault(path string, in Intent) error {
return writeIntentDefault(path, in)
}
// TestHookReadIntentDefault exposes the production intent reader to top-level tests.
// Signature: TestHookReadIntentDefault(path string) (Intent, error).
// Why: lets top-level tests delegate to production ReadIntent behavior while
// selectively forcing deterministic read sequences for lifecycle branches.
func TestHookReadIntentDefault(path string) (Intent, error) {
return readIntentDefault(path)
}
// TestHookSetReadIntentOverride swaps the package intent reader and returns a restore func.
// Signature: TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()).
// Why: enables deterministic intent-read failure injection without sleeping
// goroutines that race slower CI agents.
func TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()) {
testHookOverrideMu.Lock()
prev := readIntentImpl
if fn == nil {
readIntentImpl = readIntentDefault
} else {
readIntentImpl = fn
}
testHookOverrideMu.Unlock()
return func() {
testHookOverrideMu.Lock()
readIntentImpl = prev
testHookOverrideMu.Unlock()
}
}
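
A short usage sketch of the override/restore pattern these hooks support; the test name, error text, and path are hypothetical, and it assumes the standard testing and fmt imports within the same package.

// Hypothetical test exercising the removed read-intent override hook.
func TestReadIntentFailureIsSurfaced(t *testing.T) {
	restore := TestHookSetReadIntentOverride(func(path string) (Intent, error) {
		return Intent{}, fmt.Errorf("injected read failure for %s", path)
	})
	defer restore() // always put the production reader back

	if _, err := ReadIntent("/var/lib/ananke/intent.json"); err == nil {
		t.Fatalf("expected injected read failure to propagate")
	}
}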
// TestHookSetWriteIntentOverride swaps the package intent writer and returns a restore func.
// Signature: TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()).
// Why: enables deterministic intent-write failure injection from the top-level

View File

@ -1,116 +0,0 @@
# Binary, config template, and systemd artifact helpers for the installer.
resolve_build_target() {
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
echo "./cmd/ananke"
return 0
fi
return 1
}
install_config_template() {
local template="$1"
local dest="$2"
local src legacy
local -a modern_candidates=()
local -a legacy_candidates=()
case "${template}" in
coordinator)
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
legacy_candidates=("configs/hecate.titan-db.yaml")
;;
peer)
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
legacy_candidates=("configs/hecate.tethys.yaml")
;;
example)
modern_candidates=("configs/ananke.example.yaml")
legacy_candidates=("configs/hecate.example.yaml")
;;
*)
echo "[install] unknown config template key: ${template}" >&2
return 1
;;
esac
for src in "${modern_candidates[@]}"; do
if [[ -f "${src}" ]]; then
install -m 0640 "${src}" "${dest}"
return 0
fi
done
for legacy in "${legacy_candidates[@]}"; do
if [[ -f "${legacy}" ]]; then
src="$(mktemp)"
legacy_path_rewrite "${legacy}" "${src}"
install -m 0640 "${src}" "${dest}"
rm -f "${src}"
return 0
fi
done
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
return 1
}
install_systemd_units() {
local tmp
while IFS='|' read -r target_name modern_name legacy_name; do
local modern_src="deploy/systemd/${modern_name}"
local legacy_src="deploy/systemd/${legacy_name}"
local target="${SYSTEMD_DIR}/${target_name}"
if [[ -f "${modern_src}" ]]; then
install -m 0644 "${modern_src}" "${target}"
continue
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
install -m 0644 "${tmp}" "${target}"
rm -f "${tmp}"
continue
fi
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
return 1
done <<'EOF_UNITS'
ananke.service|ananke.service|hecate.service
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
ananke-update.service|ananke-update.service|hecate-update.service
ananke-update.timer|ananke-update.timer|hecate-update.timer
EOF_UNITS
}
install_self_update_script() {
local modern_src="scripts/ananke-self-update.sh"
local legacy_src="scripts/hecate-self-update.sh"
local target="${LIB_DIR}/ananke-self-update.sh"
local tmp
if [[ -f "${modern_src}" ]]; then
install -m 0755 "${modern_src}" "${target}"
return 0
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
sed -Ei \
-e 's/HECATE_/ANANKE_/g' \
-e 's/hecate-self-update/ananke-self-update/g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
"${tmp}"
install -m 0755 "${tmp}" "${target}"
rm -f "${tmp}"
return 0
fi
echo "[install] missing both modern and legacy self-update scripts." >&2
return 1
}

View File

@ -1,334 +0,0 @@
# Config migration helpers for the Ananke host installer.
read_ananke_role() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
echo "coordinator"
return 0
fi
local role
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -z "${role}" ]]; then
role="coordinator"
fi
echo "${role}"
}
migration_yaml_lookup() {
local key="$1"
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
first_control_plane_name() {
awk '
/^control_planes:[[:space:]]*$/ {in_list=1; next}
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
in_list && /^[^[:space:]]/ {in_list=0}
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
lookup_node_host() {
local node="$1"
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
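
These helpers are line-oriented scans over a flat key: value layout, not real YAML parsing. A rough Go equivalent of migration_yaml_lookup, shown only to make the first-match rule explicit; the path in main is illustrative.

package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// migrationYAMLLookup mimics the awk helper: the first "key: value" hit wins,
// with no awareness of nesting, quoting, or comments.
func migrationYAMLLookup(path, key string) string {
	f, err := os.Open(path)
	if err != nil {
		return ""
	}
	defer f.Close()
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		k, v, ok := strings.Cut(sc.Text(), ":")
		if ok && k == key {
			return strings.TrimSpace(v)
		}
	}
	return ""
}

func main() {
	fmt.Println(migrationYAMLLookup("/etc/ananke/ananke.yaml", "ssh_user"))
}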
migrate_ananke_config() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
return 0
fi
local changed=0
local role_hint
role_hint="$(read_ananke_role)"
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
changed=1
fi
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
install_cluster_inventory_defaults "${role_hint}" && changed=1
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
fi
}
install_cluster_inventory_defaults() {
local role="$1"
local changed=0
local inventory_block=""
local managed_block=""
local workers_block
workers_block='workers:
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
if [[ "${role}" == "coordinator" || "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
if [[ -n "${inventory_block}" ]] && grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
changed=1
fi
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
changed=1
fi
if [[ -n "${managed_block}" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
install_peer_inventory_defaults && changed=1
fi
[[ "${changed}" -eq 1 ]]
}
install_peer_inventory_defaults() {
local changed=0
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
[[ "${changed}" -eq 1 ]]
}
sanitize_migrated_ananke_config() {
local cfg="${CONF_DIR}/ananke.yaml"
[[ -f "${cfg}" ]] || return 0
local tmp changed=0
tmp="$(mktemp)"
# If a legacy migration bug appended root-level node entries after
# ssh_managed_nodes, drop those orphan entries until the next top-level key.
awk '
BEGIN {in_managed=0}
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
{
if (in_managed) {
if ($0 ~ /^ - /) {print; next}
if ($0 ~ /^- /) {next}
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
}
print
}
' "${cfg}" > "${tmp}"
if ! cmp -s "${cfg}" "${tmp}"; then
mv "${tmp}" "${cfg}"
changed=1
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
else
rm -f "${tmp}"
fi
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
changed=1
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${cfg}" || true
fi
}
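
The awk pass above is a small per-line state machine. A hedged Go rendering of the same filter, assuming the installer's two-space list indentation; the sample input in main is made up.

package main

import (
	"fmt"
	"strings"
)

// dropOrphanManagedNodeEntries mirrors the awk sanitizer: inside the
// ssh_managed_nodes block, keep "  - " items, drop root-level "- " orphans,
// and close the block at the next top-level key.
func dropOrphanManagedNodeEntries(lines []string) []string {
	var out []string
	inManaged := false
	for _, line := range lines {
		switch {
		case strings.TrimRight(line, " \t") == "ssh_managed_nodes:":
			inManaged = true
		case inManaged && strings.HasPrefix(line, "  - "):
			// well-formed list item: keep it
		case inManaged && strings.HasPrefix(line, "- "):
			continue // orphan root-level entry: drop it
		case inManaged && !strings.HasPrefix(line, " "):
			inManaged = false // next top-level key ends the block
		}
		out = append(out, line)
	}
	return out
}

func main() {
	in := []string{"ssh_managed_nodes:", "  - titan-db", "- stray-entry", "workers: []"}
	fmt.Println(strings.Join(dropOrphanManagedNodeEntries(in), "\n"))
}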

View File

@ -1,239 +0,0 @@
# Host bootstrap helpers for the Ananke installer.
resolve_nut_ups_name() {
if [[ -n "${NUT_UPS_NAME}" ]]; then
return 0
fi
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
local target=""
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
if [[ -n "${target}" ]]; then
NUT_UPS_NAME="${target%@localhost}"
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
return 0
fi
fi
NUT_UPS_NAME="pyrphoros"
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
ensure_ananke_kubeconfig() {
local kubeconfig_path
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
if [[ -z "${kubeconfig_path}" ]]; then
kubeconfig_path="/etc/ananke/kubeconfig"
fi
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
fi
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
cp_name="$(first_control_plane_name)"
if [[ -z "${cp_name}" ]]; then
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
return 0
fi
cp_host="$(lookup_node_host "${cp_name}")"
if [[ -z "${cp_host}" ]]; then
cp_host="${cp_name}"
fi
ssh_user="$(migration_yaml_lookup "ssh_user")"
ssh_port="$(migration_yaml_lookup "ssh_port")"
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${ssh_port}" ]]; then
ssh_port="2277"
fi
local target
target="${cp_host}"
if [[ -n "${ssh_user}" ]]; then
target="${ssh_user}@${cp_host}"
fi
local ssh_args=(
-o BatchMode=yes
-o ConnectTimeout=8
-o StrictHostKeyChecking=accept-new
)
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
ssh_args+=(-F "${ssh_cfg}")
fi
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
ssh_args+=(-i "${ssh_key}")
fi
if [[ -n "${ssh_port}" ]]; then
ssh_args+=(-p "${ssh_port}")
fi
local remote_cfg
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
chmod 0600 "${kubeconfig_path}"
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
else
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
fi
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
ensure_ananke_ssh_identity() {
local key_path key_dir key_user key_comment
key_path="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${key_path}" ]]; then
key_path="/home/atlas/.ssh/id_ed25519"
fi
key_dir="$(dirname "${key_path}")"
key_comment="ananke-$(hostname)-forward"
key_user="root"
if [[ "${key_path}" == /home/*/* ]]; then
key_user="${key_path#/home/}"
key_user="${key_user%%/*}"
fi
if ! id "${key_user}" >/dev/null 2>&1; then
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
return 0
fi
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
if [[ ! -s "${key_path}" ]]; then
echo "[install] generating missing SSH identity at ${key_path}"
if [[ "${key_user}" == "root" ]]; then
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
else
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
fi
fi
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
chmod 0600 "${key_path}" || true
chmod 0644 "${key_path}.pub" || true
}
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
missing+=("${pkg}")
fi
done
if [[ ${#missing[@]} -eq 0 ]]; then
return 0
fi
echo "[install] apt install: ${missing[*]}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y "${missing[@]}"
}
install_kubectl_if_missing() {
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
ensure_apt_packages kubernetes-client || true
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
echo "[install] installing kubectl via upstream binary"
local arch
arch="$(uname -m)"
case "${arch}" in
x86_64) arch="amd64" ;;
aarch64|arm64) arch="arm64" ;;
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
esac
local version
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
chmod 0755 /usr/local/bin/kubectl
}
ensure_dependencies() {
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
echo "[install] skipping dependency installation"
return 0
fi
if ! command -v apt-get >/dev/null 2>&1; then
echo "This installer currently supports apt-based hosts only." >&2
exit 1
fi
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
install_kubectl_if_missing
}
configure_nut() {
if [[ "${MANAGE_NUT}" != "1" ]]; then
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
return 0
fi
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
install -d -m 0755 /etc/nut /etc/udev/rules.d
cat > /etc/nut/nut.conf <<EOF
MODE=standalone
EOF
cat > /etc/nut/ups.conf <<EOF
[${NUT_UPS_NAME}]
driver = usbhid-ups
port = auto
vendorid = ${NUT_VENDOR_ID}
productid = ${NUT_PRODUCT_ID}
pollinterval = 5
EOF
cat > /etc/nut/upsd.users <<EOF
[${NUT_MONITOR_USER}]
password = ${NUT_MONITOR_PASSWORD}
upsmon primary
EOF
chmod 0640 /etc/nut/upsd.users
if getent group nut >/dev/null 2>&1; then
chown root:nut /etc/nut/upsd.users
else
chown root:root /etc/nut/upsd.users
fi
cat > /etc/nut/upsmon.conf <<EOF
RUN_AS_USER nut
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
MINSUPPLIES 1
SHUTDOWNCMD "/sbin/shutdown -h +0"
POLLFREQ 5
POLLFREQALERT 5
HOSTSYNC 15
DEADTIME 15
POWERDOWNFLAG /etc/killpower
EOF
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
EOF
udevadm control --reload-rules || true
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
}

View File

@ -1,98 +0,0 @@
# Legacy Hecate migration helpers for the Ananke installer.
legacy_path_rewrite() {
local src="$1"
local dst="$2"
sed \
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#/etc/hecate#/etc/ananke#g' \
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
-e 's/hecate.yaml/ananke.yaml/g' \
-e 's/hecate.lock/ananke.lock/g' \
-e 's/hecate/ananke/g' \
-e 's/Hecate/Ananke/g' \
-e 's#hecate\.lock#ananke.lock#g' \
"${src}" > "${dst}"
}
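
legacy_path_rewrite is essentially an ordered substitution table. A minimal Go approximation using strings.NewReplacer, with specific paths listed before the generic hecate->ananke renames; the sed original applies its rules sequentially, so this single-pass version is only an approximation, and the sample unit line is hypothetical.

package main

import (
	"fmt"
	"strings"
)

// legacyRewriter approximates the sed rules: specific paths first,
// then the generic hecate->ananke renames.
var legacyRewriter = strings.NewReplacer(
	"/etc/hecate/hecate.yaml", "/etc/ananke/ananke.yaml",
	"/var/lib/hecate/hecate.lock", "/var/lib/ananke/ananke.lock",
	"/opt/hecate", "/opt/ananke",
	"hecate.yaml", "ananke.yaml",
	"hecate", "ananke",
	"Hecate", "Ananke",
)

func main() {
	fmt.Println(legacyRewriter.Replace("ExecStart=/usr/local/bin/hecate --config /etc/hecate/hecate.yaml"))
	// -> ExecStart=/usr/local/bin/ananke --config /etc/ananke/ananke.yaml
}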
migrate_legacy_hecate_install() {
local legacy_conf_dir="/etc/hecate"
local legacy_state_dir="/var/lib/hecate"
local legacy_systemd_dir="/etc/systemd/system"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
chmod 0640 "${CONF_DIR}/ananke.yaml"
fi
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
fi
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
fi
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
fi
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
fi
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
fi
if [[ -d "${legacy_systemd_dir}" ]]; then
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
fi
fi
}
retire_legacy_hecate_install() {
local ts backup_dir
ts="$(date +%Y%m%d%H%M%S)"
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
systemctl stop hecate-update.service >/dev/null 2>&1 || true
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
install -d -m 0750 "${backup_dir}"
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
echo "[install] backed up legacy hecate assets to ${backup_dir}"
fi
rm -f \
/etc/systemd/system/hecate.service \
/etc/systemd/system/hecate-bootstrap.service \
/etc/systemd/system/hecate-update.service \
/etc/systemd/system/hecate-update.timer
rm -f /usr/local/bin/hecate
rm -rf /usr/local/lib/hecate
rm -rf /opt/hecate
rm -rf /etc/hecate
rm -rf /var/lib/hecate
}

View File

@ -41,10 +41,829 @@ while [[ $# -gt 0 ]]; do
esac
done
source "${REPO_DIR}/scripts/install-config-migration.sh" resolve_nut_ups_name() {
source "${REPO_DIR}/scripts/install-host-bootstrap.sh" if [[ -n "${NUT_UPS_NAME}" ]]; then
source "${REPO_DIR}/scripts/install-legacy-migration.sh" return 0
source "${REPO_DIR}/scripts/install-artifacts.sh" fi
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
local target=""
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
if [[ -n "${target}" ]]; then
NUT_UPS_NAME="${target%@localhost}"
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
return 0
fi
fi
NUT_UPS_NAME="pyrphoros"
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
read_ananke_role() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
echo "coordinator"
return 0
fi
local role
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -z "${role}" ]]; then
role="coordinator"
fi
echo "${role}"
}
migration_yaml_lookup() {
local key="$1"
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
first_control_plane_name() {
awk '
/^control_planes:[[:space:]]*$/ {in_list=1; next}
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
in_list && /^[^[:space:]]/ {in_list=0}
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
lookup_node_host() {
local node="$1"
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
ensure_ananke_kubeconfig() {
local kubeconfig_path
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
if [[ -z "${kubeconfig_path}" ]]; then
kubeconfig_path="/etc/ananke/kubeconfig"
fi
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
fi
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
cp_name="$(first_control_plane_name)"
if [[ -z "${cp_name}" ]]; then
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
return 0
fi
cp_host="$(lookup_node_host "${cp_name}")"
if [[ -z "${cp_host}" ]]; then
cp_host="${cp_name}"
fi
ssh_user="$(migration_yaml_lookup "ssh_user")"
ssh_port="$(migration_yaml_lookup "ssh_port")"
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${ssh_port}" ]]; then
ssh_port="2277"
fi
local target
target="${cp_host}"
if [[ -n "${ssh_user}" ]]; then
target="${ssh_user}@${cp_host}"
fi
local ssh_args=(
-o BatchMode=yes
-o ConnectTimeout=8
-o StrictHostKeyChecking=accept-new
)
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
ssh_args+=(-F "${ssh_cfg}")
fi
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
ssh_args+=(-i "${ssh_key}")
fi
if [[ -n "${ssh_port}" ]]; then
ssh_args+=(-p "${ssh_port}")
fi
local remote_cfg
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
chmod 0600 "${kubeconfig_path}"
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
else
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
fi
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
ensure_ananke_ssh_identity() {
local key_path key_dir key_user key_comment
key_path="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${key_path}" ]]; then
key_path="/home/atlas/.ssh/id_ed25519"
fi
key_dir="$(dirname "${key_path}")"
key_comment="ananke-$(hostname)-forward"
key_user="root"
if [[ "${key_path}" == /home/*/* ]]; then
key_user="${key_path#/home/}"
key_user="${key_user%%/*}"
fi
if ! id "${key_user}" >/dev/null 2>&1; then
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
return 0
fi
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
if [[ ! -s "${key_path}" ]]; then
echo "[install] generating missing SSH identity at ${key_path}"
if [[ "${key_user}" == "root" ]]; then
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
else
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
fi
fi
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
chmod 0600 "${key_path}" || true
chmod 0644 "${key_path}.pub" || true
}
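# Apply in-place schema migrations to an existing ananke.yaml: bump legacy
# defaults, add newly introduced keys, and hydrate empty inventory blocks.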
migrate_ananke_config() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
return 0
fi
local changed=0
local role_hint
role_hint="$(read_ananke_role)"
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
changed=1
fi
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
local role
role="$(read_ananke_role)"
local inventory_block
local managed_block
local workers_block
workers_block='workers:
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
if [[ "${role}" == "coordinator" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
elif [[ "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
if [[ -n "${inventory_block}" ]]; then
if grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
changed=1
fi
fi
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
changed=1
fi
if [[ -n "${managed_block}" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
fi
}
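# Repair config damage left behind by earlier migrations before the file is used.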
sanitize_migrated_ananke_config() {
local cfg="${CONF_DIR}/ananke.yaml"
[[ -f "${cfg}" ]] || return 0
local tmp changed=0
tmp="$(mktemp)"
# Legacy migration bug guard:
# If root-level "- node" entries were accidentally appended after ssh_managed_nodes,
# drop those orphan entries until the next top-level key.
awk '
BEGIN {in_managed=0}
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
{
if (in_managed) {
if ($0 ~ /^ - /) {print; next}
if ($0 ~ /^- /) {next}
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
}
print
}
' "${cfg}" > "${tmp}"
if ! cmp -s "${cfg}" "${tmp}"; then
mv "${tmp}" "${cfg}"
changed=1
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
else
rm -f "${tmp}"
fi
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
changed=1
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${cfg}" || true
fi
}
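# Install any of the given apt packages that are not already present.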
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
missing+=("${pkg}")
fi
done
if [[ ${#missing[@]} -eq 0 ]]; then
return 0
fi
echo "[install] apt install: ${missing[*]}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y "${missing[@]}"
}
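# Provide kubectl: prefer the distro package, otherwise download the latest
# stable upstream binary for this architecture.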
install_kubectl_if_missing() {
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
ensure_apt_packages kubernetes-client || true
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
echo "[install] installing kubectl via upstream binary"
local arch
arch="$(uname -m)"
case "${arch}" in
x86_64) arch="amd64" ;;
aarch64|arm64) arch="arm64" ;;
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
esac
local version
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
chmod 0755 /usr/local/bin/kubectl
}
ensure_dependencies() {
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
echo "[install] skipping dependency installation"
return 0
fi
if ! command -v apt-get >/dev/null 2>&1; then
echo "This installer currently supports apt-based hosts only." >&2
exit 1
fi
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
install_kubectl_if_missing
}
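# Rewrite legacy hecate paths and names to their ananke equivalents while
# writing the contents of src to dst.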
legacy_path_rewrite() {
local src="$1"
local dst="$2"
sed \
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#/etc/hecate#/etc/ananke#g' \
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
-e 's/hecate.yaml/ananke.yaml/g' \
-e 's/hecate.lock/ananke.lock/g' \
-e 's/hecate/ananke/g' \
-e 's/Hecate/Ananke/g' \
-e 's#hecate\.lock#ananke.lock#g' \
"${src}" > "${dst}"
}
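# Carry over config, kubeconfig, vault key, and state files from a legacy
# hecate installation so an upgrade keeps its history.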
migrate_legacy_hecate_install() {
local legacy_conf_dir="/etc/hecate"
local legacy_state_dir="/var/lib/hecate"
local legacy_systemd_dir="/etc/systemd/system"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
chmod 0640 "${CONF_DIR}/ananke.yaml"
fi
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
fi
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
fi
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
fi
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
fi
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
fi
if [[ -d "${legacy_systemd_dir}" ]]; then
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
fi
fi
}
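# Stop and remove legacy hecate units, binaries, and directories after backing
# them up under /var/backups.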
retire_legacy_hecate_install() {
local ts backup_dir
ts="$(date +%Y%m%d%H%M%S)"
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
systemctl stop hecate-update.service >/dev/null 2>&1 || true
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
install -d -m 0750 "${backup_dir}"
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
echo "[install] backed up legacy hecate assets to ${backup_dir}"
fi
rm -f \
/etc/systemd/system/hecate.service \
/etc/systemd/system/hecate-bootstrap.service \
/etc/systemd/system/hecate-update.service \
/etc/systemd/system/hecate-update.timer
rm -f /usr/local/bin/hecate
rm -rf /usr/local/lib/hecate
rm -rf /opt/hecate
rm -rf /etc/hecate
rm -rf /var/lib/hecate
}
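# Echo the Go package path to build when the ananke command directory exists.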
resolve_build_target() {
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
echo "./cmd/ananke"
return 0
fi
return 1
}
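# Install the role-specific config template, preferring modern ananke sources
# and rewriting legacy hecate templates when only those exist.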
install_config_template() {
local template="$1"
local dest="$2"
local src legacy
local -a modern_candidates=()
local -a legacy_candidates=()
case "${template}" in
coordinator)
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
legacy_candidates=("configs/hecate.titan-db.yaml")
;;
peer)
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
legacy_candidates=("configs/hecate.tethys.yaml")
;;
example)
modern_candidates=("configs/ananke.example.yaml")
legacy_candidates=("configs/hecate.example.yaml")
;;
*)
echo "[install] unknown config template key: ${template}" >&2
return 1
;;
esac
for src in "${modern_candidates[@]}"; do
if [[ -f "${src}" ]]; then
install -m 0640 "${src}" "${dest}"
return 0
fi
done
for legacy in "${legacy_candidates[@]}"; do
if [[ -f "${legacy}" ]]; then
src="$(mktemp)"
legacy_path_rewrite "${legacy}" "${src}"
install -m 0640 "${src}" "${dest}"
rm -f "${src}"
return 0
fi
done
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
return 1
}
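# Install systemd units from modern sources, falling back to rewritten legacy
# hecate unit files.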
install_systemd_units() {
  local tmp
while IFS='|' read -r target_name modern_name legacy_name; do
local modern_src="deploy/systemd/${modern_name}"
local legacy_src="deploy/systemd/${legacy_name}"
local target="${SYSTEMD_DIR}/${target_name}"
if [[ -f "${modern_src}" ]]; then
install -m 0644 "${modern_src}" "${target}"
continue
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
install -m 0644 "${tmp}" "${target}"
rm -f "${tmp}"
continue
fi
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
return 1
done <<'EOF_UNITS'
ananke.service|ananke.service|hecate.service
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
ananke-update.service|ananke-update.service|hecate-update.service
ananke-update.timer|ananke-update.timer|hecate-update.timer
EOF_UNITS
}
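# Install the self-update helper, rewriting the legacy hecate script when the
# modern one is absent.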
install_self_update_script() {
local modern_src="scripts/ananke-self-update.sh"
local legacy_src="scripts/hecate-self-update.sh"
local target="${LIB_DIR}/ananke-self-update.sh"
local tmp
if [[ -f "${modern_src}" ]]; then
install -m 0755 "${modern_src}" "${target}"
return 0
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
sed -Ei \
-e 's/HECATE_/ANANKE_/g' \
-e 's/hecate-self-update/ananke-self-update/g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
"${tmp}"
install -m 0755 "${tmp}" "${target}"
rm -f "${tmp}"
return 0
fi
echo "[install] missing both modern and legacy self-update scripts." >&2
return 1
}
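# Write NUT (Network UPS Tools) configuration and udev rules for the UPS,
# then enable and restart the NUT services.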
configure_nut() {
if [[ "${MANAGE_NUT}" != "1" ]]; then
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
return 0
fi
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
install -d -m 0755 /etc/nut /etc/udev/rules.d
cat > /etc/nut/nut.conf <<EOF
MODE=standalone
EOF
cat > /etc/nut/ups.conf <<EOF
[${NUT_UPS_NAME}]
driver = usbhid-ups
port = auto
vendorid = ${NUT_VENDOR_ID}
productid = ${NUT_PRODUCT_ID}
pollinterval = 5
EOF
cat > /etc/nut/upsd.users <<EOF
[${NUT_MONITOR_USER}]
password = ${NUT_MONITOR_PASSWORD}
upsmon primary
EOF
chmod 0640 /etc/nut/upsd.users
if getent group nut >/dev/null 2>&1; then
chown root:nut /etc/nut/upsd.users
else
chown root:root /etc/nut/upsd.users
fi
cat > /etc/nut/upsmon.conf <<EOF
RUN_AS_USER nut
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
MINSUPPLIES 1
SHUTDOWNCMD "/sbin/shutdown -h +0"
POLLFREQ 5
POLLFREQALERT 5
HOSTSYNC 15
DEADTIME 15
POWERDOWNFLAG /etc/killpower
EOF
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
EOF
udevadm control --reload-rules || true
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
}
ensure_dependencies
migrate_legacy_hecate_install

View File

@@ -6,28 +6,9 @@ cd "${REPO_DIR}"
 export PATH="$(go env GOPATH)/bin:${PATH}"
 STATICCHECK_VERSION="${ANANKE_STATICCHECK_VERSION:-2025.1.1}"
 
-run_with_retry() {
-  local attempts="$1"
-  shift
-  local try=1
-  local delay=3
-  local rc=0
-  while true; do
-    "$@" && return 0
-    rc=$?
-    if [[ "${try}" -ge "${attempts}" ]]; then
-      return "${rc}"
-    fi
-    echo "[lint] retry ${try}/${attempts} after rc=${rc}: $*" >&2
-    sleep "${delay}"
-    delay=$((delay * 2))
-    try=$((try + 1))
-  done
-}
-
 if ! command -v staticcheck >/dev/null 2>&1 || ! staticcheck -version 2>/dev/null | grep -q "${STATICCHECK_VERSION}"; then
   echo "[lint] installing staticcheck ${STATICCHECK_VERSION}"
-  run_with_retry 4 go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
+  go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
 fi
 
 echo "[lint] go vet"

View File

@@ -77,17 +77,6 @@ def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str,
     return 0.0
 
-def _series_exists(pushgateway_url: str, metric: str, labels: dict[str, str], timeout_seconds: float) -> bool:
-    """Return whether Pushgateway already has a series for this build."""
-    text = _read_http(f"{pushgateway_url.rstrip('/')}/metrics", timeout_seconds)
-    for line in text.splitlines():
-        if not line.startswith(metric + "{"):
-            continue
-        if all(f'{key}="{value}"' in line for key, value in labels.items()):
-            return True
-    return False
-
 def _build_payload(
     suite: str,
     trigger: str,
@@ -100,25 +89,9 @@ def _build_payload(
     tests_skipped: int,
     test_cases: list[tuple[str, str]],
     coverage_percent: float,
-    source_files_total: int,
     source_lines_over_500: int,
-    branch: str,
-    build_number: str,
-    jenkins_job: str,
     checks: dict[str, str],
 ) -> str:
-    build_labels = {
-        "suite": suite,
-        "branch": branch,
-        "build_number": build_number or "unknown",
-        "jenkins_job": jenkins_job,
-    }
-    test_case_base_labels = {
-        "suite": suite,
-        "branch": branch,
-        "build_number": build_number or "unknown",
-        "jenkins_job": jenkins_job,
-    }
     lines = [
         "# TYPE platform_quality_gate_runs_total counter",
         f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
@@ -132,30 +105,21 @@ def _build_payload(
         f'ananke_quality_gate_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
         "# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
         f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
-        "# TYPE platform_quality_gate_source_files_total gauge",
-        f'platform_quality_gate_source_files_total{{suite="{suite}"}} {source_files_total}',
         "# TYPE platform_quality_gate_source_lines_over_500_total gauge",
         f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
-        "# TYPE platform_quality_gate_build_info gauge",
-        f"platform_quality_gate_build_info{_label_str(build_labels)} 1",
+        "# TYPE platform_quality_gate_test_case_result gauge",
         "# TYPE ananke_quality_gate_checks_total gauge",
         "# TYPE ananke_quality_gate_publish_info gauge",
         f'ananke_quality_gate_publish_info{_label_str({"suite": suite, "trigger": trigger})} 1',
     ]
+    lines.extend(
+        f'platform_quality_gate_test_case_result{{suite="{suite}",test="{_escape_label(test_name)}",status="{_escape_label(test_status)}"}} 1'
+        for test_name, test_status in test_cases
+    )
     lines.extend(
         f'ananke_quality_gate_checks_total{{suite="{suite}",check="{check_name}",result="{check_status}"}} 1'
        for check_name, check_status in checks.items()
     )
-    lines.append("# TYPE platform_quality_gate_test_case_result gauge")
-    if test_cases:
-        lines.extend(
-            f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': test_name, 'status': test_status})} 1"
-            for test_name, test_status in test_cases
-        )
-    else:
-        lines.append(
-            f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': '__no_test_cases__', 'status': 'skipped'})} 1"
-        )
 
     return "\n".join(lines) + "\n"
@@ -172,7 +136,8 @@ def _read_coverage_percent(path: str) -> float:
     return 0.0
 
-def _iter_source_files(repo_root: Path):
+def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
+    count = 0
     for rel_root in SOURCE_SCAN_ROOTS:
         base = repo_root / rel_root
         if not base.exists():
@@ -182,37 +147,12 @@ def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
                 continue
             if path.suffix not in SOURCE_EXTENSIONS:
                 continue
-            if path.name.endswith("_test.go") or path.name.endswith(".test.py"):
-                continue
-            yield path
-
-def _count_source_files(repo_root: Path) -> int:
-    return sum(1 for _ in _iter_source_files(repo_root))
-
-def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
-    count = 0
-    for path in _iter_source_files(repo_root):
-        lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
-        if lines > max_lines:
-            count += 1
+            lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
+            if lines > max_lines:
+                count += 1
     return count
 
-def _unit_tests_failed(output_path: Path, coverage_percent: float) -> bool:
-    if coverage_percent <= 0 or not output_path.exists():
-        return True
-    text = output_path.read_text(encoding="utf-8", errors="ignore")
-    start_marker = "[quality] unit tests + workspace coverage profile"
-    end_marker = "[quality] hygiene: doc contracts"
-    if start_marker in text:
-        text = text.split(start_marker, 1)[1]
-    if end_marker in text:
-        text = text.split(end_marker, 1)[0]
-    return bool(re.search(r"^(--- FAIL:|FAIL\\b)", text, flags=re.M))
-
 def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
     if not output_path.exists():
         return {"passed": 0, "failed": 0, "errors": 0, "skipped": 0}
@@ -226,37 +166,14 @@ def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
 def _parse_go_test_cases(output_path: Path) -> list[tuple[str, str]]:
-    """Parse per-test status records from go test output text."""
     if not output_path.exists():
         return []
     text = output_path.read_text(encoding="utf-8", errors="ignore")
     cases: list[tuple[str, str]] = []
-    patterns = {
-        "passed": re.compile(r"^--- PASS: ([^\s(]+)", flags=re.M),
-        "failed": re.compile(r"^--- FAIL: ([^\s(]+)", flags=re.M),
-        "skipped": re.compile(r"^--- SKIP: ([^\s(]+)", flags=re.M),
-    }
-    for status, pattern in patterns.items():
-        for test_name in pattern.findall(text):
-            cleaned = str(test_name).strip()
-            if cleaned:
-                cases.append((cleaned, status))
-    if cases:
-        return cases
-    # Fallback for non-verbose `go test` output where individual test names are absent.
-    package_cases: list[tuple[str, str]] = []
-    for package_name in re.findall(r"^ok\s+([^\s]+)", text, flags=re.M):
-        cleaned = str(package_name).strip()
-        if cleaned:
-            package_cases.append((f"package::{cleaned}", "passed"))
-    for package_name in re.findall(r"^FAIL\s+([^\s]+)", text, flags=re.M):
-        cleaned = str(package_name).strip()
-        if cleaned:
-            package_cases.append((f"package::{cleaned}", "failed"))
-    if package_cases:
-        deduped = list(dict.fromkeys(package_cases))
-        return deduped
+    for match in re.finditer(r"^---\s+(PASS|FAIL|SKIP):\s+(\S+)", text, flags=re.M):
+        raw_status, test_name = match.groups()
+        status = {"PASS": "passed", "FAIL": "failed", "SKIP": "skipped"}.get(raw_status, "error")
+        cases.append((test_name.strip(), status))
     return cases
@@ -307,23 +224,17 @@ def _sonarqube_check_status(build_dir: Path) -> str:
 def _supply_chain_check_status(build_dir: Path) -> str:
-    required = os.getenv("QUALITY_GATE_IRONBANK_REQUIRED", "0").strip().lower() in {"1", "true", "yes", "on"}
     report = _load_json(Path(os.getenv("QUALITY_GATE_IRONBANK_REPORT", str(build_dir / "ironbank-compliance.json"))))
     if not report:
-        return "failed" if required else "not_applicable"
+        return "not_applicable"
     compliant = report.get("compliant")
     if isinstance(compliant, bool):
         return "ok" if compliant else "failed"
     status_candidates = [report.get("status"), report.get("result"), report.get("compliance")]
     for value in status_candidates:
         if isinstance(value, str):
-            normalized = value.strip().lower()
-            if normalized in QUALITY_SUCCESS_STATES:
-                return "ok"
-            if normalized in {"n/a", "na", "not_applicable", "not-applicable", "skipped", "skip"}:
-                return "failed" if required else "not_applicable"
-            return "failed" if required else "not_applicable"
-    return "failed" if required else "not_applicable"
+            return "ok" if value.strip().lower() in QUALITY_SUCCESS_STATES else "failed"
+    return "failed"
 
 def parse_args(argv: list[str]) -> argparse.Namespace:
@@ -367,19 +278,10 @@ def main(argv: list[str] | None = None) -> int:
     args = parse_args(argv or sys.argv[1:])
     repo_root = Path(__file__).resolve().parents[1]
     build_dir = repo_root / "build"
-    gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
-    current_ok = 1 if gate_rc == 0 else 0
-    current_failed = 0 if gate_rc == 0 else 1
-    branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
-    if branch.startswith("origin/"):
-        branch = branch[len("origin/") :]
-    build_number = os.getenv("BUILD_NUMBER", "")
-    jenkins_job = os.getenv("JOB_NAME", "ananke")
     remote_ok = 0
     remote_failed = 0
     remote_error = ""
-    already_recorded = False
     try:
         remote_ok = int(
             _fetch_existing_counter(
@@ -397,39 +299,21 @@ def main(argv: list[str] | None = None) -> int:
                 args.timeout_seconds,
             )
         )
-        already_recorded = bool(build_number) and _series_exists(
-            args.pushgateway_url,
-            "platform_quality_gate_build_info",
-            {
-                "job": args.job_name,
-                "suite": args.suite,
-                "branch": branch or "unknown",
-                "build_number": build_number or "unknown",
-                "jenkins_job": jenkins_job,
-            },
-            args.timeout_seconds,
-        )
     except Exception as exc:
         remote_error = str(exc)
-    resolved_ok = remote_ok
-    resolved_failed = remote_failed
-    if remote_error:
-        resolved_ok = args.local_ok
-        resolved_failed = args.local_failed
-    elif not already_recorded:
-        resolved_ok += current_ok
-        resolved_failed += current_failed
+    resolved_ok = max(args.local_ok, remote_ok)
+    resolved_failed = max(args.local_failed, remote_failed)
     coverage_percent = _read_coverage_percent(args.coverage_percent_file)
-    source_files_total = _count_source_files(repo_root)
     source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
-    quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
-    tests = _parse_go_test_counts(quality_output)
-    test_cases = _parse_go_test_cases(quality_output)
+    test_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
+    tests = _parse_go_test_counts(test_output)
+    test_cases = _parse_go_test_cases(test_output)
+    gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
     docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
-    unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent)
+    gate_failed = gate_rc != 0
     checks = {
-        "tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok",
+        "tests": "failed" if gate_failed or tests["failed"] > 0 else "ok",
         "coverage": "ok" if coverage_percent >= 95.0 else "failed",
         "loc": "ok" if source_lines_over_500 == 0 else "failed",
         "docs_naming": docs_status,
@@ -448,11 +332,7 @@ def main(argv: list[str] | None = None) -> int:
         tests_skipped=tests["skipped"],
         test_cases=test_cases,
         coverage_percent=coverage_percent,
-        source_files_total=source_files_total,
         source_lines_over_500=source_lines_over_500,
-        branch=branch,
-        build_number=build_number,
-        jenkins_job=jenkins_job,
         checks=checks,
     )
@@ -465,8 +345,7 @@ def main(argv: list[str] | None = None) -> int:
     summary = (
         f"[quality] published Pushgateway metrics suite={args.suite} job={args.job_name} ok={resolved_ok} "
-        f"failed={resolved_failed} coverage={coverage_percent:.3f} source_files_total={source_files_total} "
-        f"source_lines_over_500={source_lines_over_500}"
+        f"failed={resolved_failed} coverage={coverage_percent:.3f} source_lines_over_500={source_lines_over_500}"
     )
     if remote_error:
         summary += f" remote_read_error={remote_error}"

View File

@@ -3,11 +3,8 @@
 from __future__ import annotations
 
 import http.server
-from pathlib import Path
 import socketserver
-import tempfile
 import threading
-from unittest import mock
 import unittest
 
 import publish_quality_metrics as publisher
@@ -61,19 +58,7 @@ class PublishQualityMetricsTest(unittest.TestCase):
         self.server.server_close()
         self.thread.join(timeout=5)
 
-    def _env_for_gate_status(self, status: int = 0) -> dict[str, str]:
-        tmp_dir = tempfile.TemporaryDirectory()
-        self.addCleanup(tmp_dir.cleanup)
-        rc_path = Path(tmp_dir.name) / "quality-gate.rc"
-        rc_path.write_text(f"{status}\n", encoding="utf-8")
-        return {
-            "ANANKE_QUALITY_EXIT_CODE_PATH": str(rc_path),
-            "ANANKE_QUALITY_COVERAGE_PERCENT_FILE": str(Path(tmp_dir.name) / "coverage.txt"),
-            "ANANKE_QUALITY_OUTPUT_FILE": str(Path(tmp_dir.name) / "quality-gate.out"),
-            "ANANKE_QUALITY_DOCS_STATUS_PATH": str(Path(tmp_dir.name) / "docs-naming.status"),
-        }
-
-    def test_publish_adds_current_run_to_remote_counters(self) -> None:
+    def test_publish_uses_remote_high_water_mark(self) -> None:
         _GatewayHandler.metrics_text = "\n".join(
             [
                 '# TYPE platform_quality_gate_runs_total counter',
@@ -82,93 +67,51 @@ class PublishQualityMetricsTest(unittest.TestCase):
             ]
         )
-        with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
-            exit_code = publisher.main(
-                [
-                    "--pushgateway-url",
-                    self.base_url,
-                    "--job-name",
-                    "platform-quality-ci",
-                    "--suite",
-                    "ananke",
-                    "--trigger",
-                    "host",
-                    "--local-ok",
-                    "5",
-                    "--local-failed",
-                    "2",
-                ]
-            )
+        exit_code = publisher.main(
+            [
+                "--pushgateway-url",
+                self.base_url,
+                "--job-name",
+                "platform-quality-ci",
+                "--suite",
+                "ananke",
+                "--trigger",
+                "host",
+                "--local-ok",
+                "5",
+                "--local-failed",
+                "2",
+            ]
+        )
         self.assertEqual(exit_code, 0)
         self.assertEqual(len(_GatewayHandler.posts), 1)
         path, body = _GatewayHandler.posts[0]
         self.assertEqual(path, "/metrics/job/platform-quality-ci/suite/ananke")
-        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 8', body)
-        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
+        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
+        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 2', body)
         self.assertIn('ananke_quality_gate_publish_info{suite="ananke",trigger="host"} 1', body)
         self.assertIn('ananke_quality_gate_coverage_percent{suite="ananke"}', body)
         self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
-        self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
         self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)
 
-    def test_publish_does_not_double_count_same_build(self) -> None:
-        _GatewayHandler.metrics_text = "\n".join(
-            [
-                'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="ok"} 7',
-                'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="failed"} 1',
-                'platform_quality_gate_build_info{job="platform-quality-ci",suite="ananke",branch="main",build_number="78",jenkins_job="ananke"} 1',
-            ]
-        )
-        with mock.patch.dict(
-            "os.environ",
-            {
-                **self._env_for_gate_status(0),
-                "BRANCH_NAME": "main",
-                "BUILD_NUMBER": "78",
-                "JOB_NAME": "ananke",
-            },
-        ):
-            exit_code = publisher.main(
-                [
-                    "--pushgateway-url",
-                    self.base_url,
-                    "--job-name",
-                    "platform-quality-ci",
-                    "--suite",
-                    "ananke",
-                    "--trigger",
-                    "host",
-                    "--local-ok",
-                    "1",
-                    "--local-failed",
-                    "0",
-                ]
-            )
-        self.assertEqual(exit_code, 0)
-        _, body = _GatewayHandler.posts[0]
-        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
-        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
-
     def test_publish_falls_back_to_local_counters_when_metrics_read_fails(self) -> None:
         _GatewayHandler.fail_metrics_read = True
-        with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
-            exit_code = publisher.main(
-                [
-                    "--pushgateway-url",
-                    self.base_url,
-                    "--job-name",
-                    "platform-quality-ci",
-                    "--suite",
-                    "ananke",
-                    "--local-ok",
-                    "11",
-                    "--local-failed",
-                    "3",
-                ]
-            )
+        exit_code = publisher.main(
+            [
+                "--pushgateway-url",
+                self.base_url,
+                "--job-name",
+                "platform-quality-ci",
+                "--suite",
+                "ananke",
+                "--local-ok",
+                "11",
+                "--local-failed",
+                "3",
+            ]
+        )
         self.assertEqual(exit_code, 0)
         self.assertEqual(len(_GatewayHandler.posts), 1)
@@ -176,7 +119,6 @@ class PublishQualityMetricsTest(unittest.TestCase):
         self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 11', body)
         self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 3', body)
         self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
-        self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
         self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)

View File

@@ -158,9 +158,15 @@ mkdir -p "${BUILD_DIR}"
 rm -f "${COVERAGE_PROFILE}" "${COVERAGE_PERCENT_FILE}"
 printf 'failed\n' > "${BUILD_DIR}/docs-naming.status"
 
-echo "[quality] dependency download"
+echo "[quality] unit tests + workspace coverage profile"
 export GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}"
 run_with_retry 4 go mod download
+run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
+coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
+if [[ -z "${coverage_percent}" ]]; then
+  coverage_percent="0"
+fi
+printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
 
 echo "[quality] hygiene: doc contracts"
 cd testing
@@ -183,14 +189,6 @@ echo "[quality] lint"
 echo "[quality] installer template contracts"
 ./scripts/verify_install_templates.sh
 
-echo "[quality] unit tests + workspace coverage profile"
-run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
-coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
-if [[ -z "${coverage_percent}" ]]; then
-  coverage_percent="0"
-fi
-printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
-
 echo "[quality] per-file coverage gate (95%)"
 cd testing
 ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v

View File

@@ -17,12 +17,6 @@ import (
 const maxGoFileLOC = 500
 
 var goFileNamePattern = regexp.MustCompile(`^[a-z0-9]+(_[a-z0-9]+)*(_test)?\.go$`)
-var genericFileNameTokens = map[string]struct{}{
-    "chunk": {},
-    "part":  {},
-    "piece": {},
-    "split": {},
-}
 
 func repoRoot(tb testing.TB) string {
     tb.Helper()
@@ -67,16 +61,13 @@ func collectGoFiles(tb testing.TB, roots ...string) []string {
 func TestHygieneContracts(t *testing.T) {
     root := repoRoot(t)
     files := collectGoFiles(t, filepath.Join(root, "cmd"), filepath.Join(root, "internal"))
-    namingFiles := append([]string{}, files...)
-    namingFiles = append(namingFiles, collectGoFiles(t, filepath.Join(root, "testing"))...)
     sort.Strings(files)
-    sort.Strings(namingFiles)
     t.Run("doc_contract", func(t *testing.T) {
         checkDocContracts(t, files)
     })
     t.Run("naming_contract", func(t *testing.T) {
-        checkNamingContracts(t, namingFiles)
+        checkNamingContracts(t, files)
     })
     t.Run("loc_limit", func(t *testing.T) {
         checkFileLOCLimits(t, files)
@@ -130,19 +121,9 @@ func checkNamingContracts(t *testing.T, files []string) {
         if !goFileNamePattern.MatchString(base) {
             t.Errorf("%s: filename %q violates naming contract %s", file, base, goFileNamePattern.String())
         }
-        for _, token := range filenameTokens(base) {
-            if _, ok := genericFileNameTokens[token]; ok {
-                t.Errorf("%s: filename %q uses generic split-file token %q", file, base, token)
-            }
-        }
     }
 }
 
-func filenameTokens(name string) []string {
-    trimmed := strings.TrimSuffix(strings.TrimSuffix(name, ".go"), "_test")
-    return strings.Split(trimmed, "_")
-}
-
 // checkFileLOCLimits runs one orchestration or CLI step.
 // Signature: checkFileLOCLimits(t *testing.T, files []string).
 // Why: A strict LOC cap forces focused files and keeps refactors manageable.

View File

@@ -13,8 +13,6 @@ cmd/ananke/power_safety_test.go
 cmd/ananke/test_helpers_test.go
 internal/cluster/orchestrator_inventory_test.go
 internal/cluster/orchestrator_report_test.go
-internal/cluster/orchestrator_autorepair_test.go
-internal/cluster/orchestrator_autorepair_cleanup_test.go
 internal/cluster/orchestrator_test.go
 internal/cluster/orchestrator_unit_additional_test.go
 internal/cluster/orchestrator_vault_test.go
@@ -23,7 +21,6 @@ internal/config/load_additional_test.go
 internal/config/validate_matrix_test.go
 internal/service/daemon_additional_test.go
 internal/service/daemon_coverage_closeout_test.go
-internal/service/daemon_poststart_autorepair_test.go
 internal/service/daemon_quality_branches_test.go
 internal/service/daemon_test.go
 internal/sshutil/repair_test.go

View File

@@ -363,3 +363,4 @@ func TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T) {
         }
     })
 }
+

View File

@@ -79,29 +79,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
         }
     })
 
-    t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) {
-        cfg := lifecycleConfig(t)
-        cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"}
-        run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
-            command := name + " " + strings.Join(args, " ")
-            switch {
-            case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
-                return `{"items":[
-{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}},
-{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}}
-]}`, nil
-            default:
-                return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
-            }
-        }
-        orch, _ := newHookOrchestrator(t, cfg, run, run)
-        ready, detail, err := orch.TestHookFluxHealthReady(context.Background())
-        if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") {
-            t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
-        }
-    })
-
     t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
         cfg := lifecycleConfig(t)
         cfg.Startup.WorkloadConvergenceWaitSeconds = 1
@@ -168,42 +145,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
         }
     })
 
-    t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) {
-        cfg := lifecycleConfig(t)
-        cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
-        cfg.Startup.StuckPodGraceSeconds = 1
-        run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
-            command := name + " " + strings.Join(args, " ")
-            switch {
-            case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
-                return `{"items":[
-{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
-{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
-]}`, nil
-            case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
-                return `{"items":[
-{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}},
-{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}
-]}`, nil
-            default:
-                return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
-            }
-        }
-        orch, _ := newHookOrchestrator(t, cfg, run, run)
-        ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background())
-        if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
-            t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
-        }
-        failures, err := orch.TestHookStartupFailurePods(context.Background())
-        if err != nil {
-            t.Fatalf("startup failure pod query: %v", err)
-        }
-        if len(failures) != 0 {
-            t.Fatalf("expected optional namespace failures to be ignored, got %v", failures)
-        }
-    })
-
     t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
         cfg := lifecycleConfig(t)
         run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// newHookOrchestratorWithRunnerMode runs one orchestration or CLI step. // newHookOrchestratorAdvanced runs one orchestration or CLI step.
// Signature: newHookOrchestratorWithRunnerMode(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder). // Signature: newHookOrchestratorAdvanced(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
// Why: these scenarios need dry-run and non-dry-run variants while keeping // Why: this part10 matrix needs dry-run and non-dry-run variants while keeping
// command dispatch deterministic from the top-level testing module. // command dispatch deterministic from the top-level testing module.
func newHookOrchestratorWithRunnerMode( func newHookOrchestratorAdvanced(
t *testing.T, t *testing.T,
cfg config.Config, cfg config.Config,
dryRun bool, dryRun bool,
@ -49,11 +49,11 @@ func newHookOrchestratorWithRunnerMode(
return orch, recorder return orch, recorder
} }
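The distinctive piece of the helper above is the dryRun flag it threads into the runner. A sketch of one way such a flag can be honored, assuming an illustrative `dryRunWrap` decorator (not the repo's actual wiring): in dry-run mode commands are recorded but never executed, so orchestration logic can be exercised without side effects.

package main

import (
	"context"
	"fmt"
	"time"
)

// dryRunWrap decorates a command runner so that, in dry-run mode, commands
// are logged and reported as successful without being executed.
func dryRunWrap(dryRun bool, log *[]string, real func(context.Context, time.Duration, string, ...string) (string, error)) func(context.Context, time.Duration, string, ...string) (string, error) {
	return func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		if dryRun {
			*log = append(*log, name)
			return "", nil // pretend success without side effects
		}
		return real(ctx, timeout, name, args...)
	}
}

func main() {
	var log []string
	real := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		return "ran " + name, nil
	}
	run := dryRunWrap(true, &log, real)
	out, _ := run(context.Background(), time.Second, "kubectl")
	fmt.Printf("out=%q recorded=%v\n", out, log) // out="" recorded=[kubectl]
}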
// TestHookVaultLifecycleBranchMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart10LowFileClosure runs one orchestration or CLI step.
// Signature: TestHookVaultLifecycleBranchMatrix(t *testing.T). // Signature: TestHookGapMatrixPart10LowFileClosure(t *testing.T).
// Why: closes remaining branch gaps on low-coverage orchestrator files using // Why: closes remaining branch gaps on low-coverage orchestrator files using
// targeted hook-level scenarios instead of brittle full-drill reruns. // targeted hook-level scenarios instead of brittle full-drill reruns.
func TestHookVaultLifecycleBranchMatrix(t *testing.T) { func TestHookGapMatrixPart10LowFileClosure(t *testing.T) {
t.Run("critical-vault-low-branches", func(t *testing.T) { t.Run("critical-vault-low-branches", func(t *testing.T) {
t.Run("vault-sealed-parse-error", func(t *testing.T) { t.Run("vault-sealed-parse-error", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
@ -64,7 +64,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookVaultSealed(context.Background()); err == nil || !strings.Contains(err.Error(), "parse vault status") { if _, err := orch.TestHookVaultSealed(context.Background()); err == nil || !strings.Contains(err.Error(), "parse vault status") {
t.Fatalf("expected vault status parse error branch, got %v", err) t.Fatalf("expected vault status parse error branch, got %v", err)
} }
@ -81,7 +81,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookVaultUnsealKey(context.Background()); err == nil || !strings.Contains(err.Error(), "vault-init unseal key is empty") { if _, err := orch.TestHookVaultUnsealKey(context.Background()); err == nil || !strings.Contains(err.Error(), "vault-init unseal key is empty") {
t.Fatalf("expected empty decoded unseal key branch, got %v", err) t.Fatalf("expected empty decoded unseal key branch, got %v", err)
} }
@ -90,7 +90,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("write-unseal-key-file-write-error", func(t *testing.T) { t.Run("write-unseal-key-file-write-error", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.VaultUnsealKeyFile = t.TempDir() cfg.Startup.VaultUnsealKeyFile = t.TempDir()
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
if err := orch.TestHookWriteVaultUnsealKeyFile("vault-key"); err == nil || !strings.Contains(err.Error(), "write vault unseal key file") { if err := orch.TestHookWriteVaultUnsealKeyFile("vault-key"); err == nil || !strings.Contains(err.Error(), "write vault unseal key file") {
t.Fatalf("expected write failure branch when key path is a directory, got %v", err) t.Fatalf("expected write failure branch when key path is a directory, got %v", err)
} }
@ -105,7 +105,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchNoValue, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runNoValue, runNoValue) orchNoValue, _ := newHookOrchestratorAdvanced(t, cfg, false, runNoValue, runNoValue)
ready, err := orchNoValue.TestHookWorkloadReady(context.Background(), "vault", "statefulset", "vault") ready, err := orchNoValue.TestHookWorkloadReady(context.Background(), "vault", "statefulset", "vault")
if err != nil || ready { if err != nil || ready {
t.Fatalf("expected no-value readiness branch, ready=%v err=%v", ready, err) t.Fatalf("expected no-value readiness branch, ready=%v err=%v", ready, err)
@ -124,7 +124,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orchEnsureErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runEnsureErr, runEnsureErr) orchEnsureErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runEnsureErr, runEnsureErr)
if err := orchEnsureErr.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "rollout failed") { if err := orchEnsureErr.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "rollout failed") {
t.Fatalf("expected ensureCriticalStartupWorkloads wait error branch, got %v", err) t.Fatalf("expected ensureCriticalStartupWorkloads wait error branch, got %v", err)
} }
@ -139,7 +139,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchPhase, _ := newHookOrchestratorWithRunnerMode(t, cfgPhase, false, runPhase, runPhase) orchPhase, _ := newHookOrchestratorAdvanced(t, cfgPhase, false, runPhase, runPhase)
if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") { if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
t.Fatalf("expected pod phase guard branch, got %v", err) t.Fatalf("expected pod phase guard branch, got %v", err)
} }
@ -170,7 +170,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return runFollowup(ctx, timeout, name, args...) return runFollowup(ctx, timeout, name, args...)
} }
orchFollowup, _ := newHookOrchestratorWithRunnerMode(t, cfgFollowup, false, runFollowup, runSensitive) orchFollowup, _ := newHookOrchestratorAdvanced(t, cfgFollowup, false, runFollowup, runSensitive)
if err := orchFollowup.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") { if err := orchFollowup.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") {
t.Fatalf("expected follow-up sealed status error branch, got %v", err) t.Fatalf("expected follow-up sealed status error branch, got %v", err)
} }
@ -204,7 +204,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
err := orch.TestHookDrainWorkers(context.Background(), workers) err := orch.TestHookDrainWorkers(context.Background(), workers)
if err == nil || !strings.Contains(err.Error(), "drain workers had 5 errors") { if err == nil || !strings.Contains(err.Error(), "drain workers had 5 errors") {
t.Fatalf("expected drain aggregation branch, got %v", err) t.Fatalf("expected drain aggregation branch, got %v", err)
@ -217,7 +217,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
cfg.SSHManagedNodes = []string{"titan-db"} cfg.SSHManagedNodes = []string{"titan-db"}
rec := &commandRecorder{} rec := &commandRecorder{}
base := lifecycleDispatcher(rec) base := lifecycleDispatcher(rec)
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
orch.TestHookRunSSHAcrossNodes(context.Background(), []string{"titan-db", "not-managed"}, "noop", "echo ok") orch.TestHookRunSSHAcrossNodes(context.Background(), []string{"titan-db", "not-managed"}, "noop", "echo ok")
if !rec.contains("atlas@titan-db echo ok") { if !rec.contains("atlas@titan-db echo ok") {
t.Fatalf("expected managed ssh execution branch") t.Fatalf("expected managed ssh execution branch")
@ -233,7 +233,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookLatestEtcdSnapshotPath(context.Background(), "titan-db"); err == nil || !strings.Contains(err.Error(), "no etcd snapshots found") { if _, err := orch.TestHookLatestEtcdSnapshotPath(context.Background(), "titan-db"); err == nil || !strings.Contains(err.Error(), "no etcd snapshots found") {
t.Fatalf("expected empty snapshot-list branch, got %v", err) t.Fatalf("expected empty snapshot-list branch, got %v", err)
} }
@ -250,7 +250,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchWorkers, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runWorkers, runWorkers) orchWorkers, _ := newHookOrchestratorAdvanced(t, cfg, false, runWorkers, runWorkers)
workers, err := orchWorkers.TestHookEffectiveWorkers(context.Background()) workers, err := orchWorkers.TestHookEffectiveWorkers(context.Background())
if err != nil || len(workers) == 0 { if err != nil || len(workers) == 0 {
t.Fatalf("expected inventory worker fallback branch, workers=%v err=%v", workers, err) t.Fatalf("expected inventory worker fallback branch, workers=%v err=%v", workers, err)
@ -273,7 +273,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orchWrite, _ := newHookOrchestratorWithRunnerMode(t, cfgWrite, false, runWrite, runWrite) orchWrite, _ := newHookOrchestratorAdvanced(t, cfgWrite, false, runWrite, runWrite)
if err := orchWrite.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "write scaled workload snapshot") { if err := orchWrite.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "write scaled workload snapshot") {
t.Fatalf("expected scaled snapshot write-failure branch, got %v", err) t.Fatalf("expected scaled snapshot write-failure branch, got %v", err)
} }
@ -294,7 +294,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orchReady, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runReady, runReady) orchReady, _ := newHookOrchestratorAdvanced(t, cfg, false, runReady, runReady)
ready, detail, err := orchReady.TestHookFluxHealthReady(context.Background()) ready, detail, err := orchReady.TestHookFluxHealthReady(context.Background())
if err != nil || ready || !strings.Contains(detail, "ready=false") { if err != nil || ready || !strings.Contains(detail, "ready=false") {
t.Fatalf("expected flux ready-reason fallback branch, ready=%v detail=%q err=%v", ready, detail, err) t.Fatalf("expected flux ready-reason fallback branch, ready=%v detail=%q err=%v", ready, detail, err)
@ -319,7 +319,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cancel() cancel()
if err := orch.TestHookWaitForFluxHealth(ctx); !errors.Is(err, context.Canceled) { if err := orch.TestHookWaitForFluxHealth(ctx); !errors.Is(err, context.Canceled) {
@ -336,7 +336,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
rec := &commandRecorder{} rec := &commandRecorder{}
base := lifecycleDispatcher(rec) base := lifecycleDispatcher(rec)
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil { if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected ensureRequiredNodeLabels skip/apply branches, got %v", err) t.Fatalf("expected ensureRequiredNodeLabels skip/apply branches, got %v", err)
} }
@ -347,7 +347,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("wait-for-startup-convergence-dryrun-and-critical-endpoint-fail", func(t *testing.T) { t.Run("wait-for-startup-convergence-dryrun-and-critical-endpoint-fail", func(t *testing.T) {
cfgDry := lifecycleConfig(t) cfgDry := lifecycleConfig(t)
orchDry, _ := newHookOrchestratorWithRunnerMode(t, cfgDry, true, nil, nil) orchDry, _ := newHookOrchestratorAdvanced(t, cfgDry, true, nil, nil)
if err := orchDry.TestHookWaitForStartupConvergence(context.Background()); err != nil { if err := orchDry.TestHookWaitForStartupConvergence(context.Background()); err != nil {
t.Fatalf("expected startup convergence dry-run fast path, got %v", err) t.Fatalf("expected startup convergence dry-run fast path, got %v", err)
} }
@ -365,7 +365,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchFail, _ := newHookOrchestratorWithRunnerMode(t, cfgFail, false, run, run) orchFail, _ := newHookOrchestratorAdvanced(t, cfgFail, false, run, run)
if err := orchFail.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query endpoints") { if err := orchFail.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query endpoints") {
t.Fatalf("expected critical-endpoint convergence failure branch, got %v", err) t.Fatalf("expected critical-endpoint convergence failure branch, got %v", err)
} }
@ -373,7 +373,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("ingress-namespace-discovery-empty-and-query-error", func(t *testing.T) { t.Run("ingress-namespace-discovery-empty-and-query-error", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
orchEmpty, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil) orchEmpty, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
namespaces, err := orchEmpty.TestHookDiscoverIngressNamespacesForHost(context.Background(), " ") namespaces, err := orchEmpty.TestHookDiscoverIngressNamespacesForHost(context.Background(), " ")
if err != nil || len(namespaces) != 0 { if err != nil || len(namespaces) != 0 {
t.Fatalf("expected empty-host fast path, namespaces=%v err=%v", namespaces, err) t.Fatalf("expected empty-host fast path, namespaces=%v err=%v", namespaces, err)
@ -386,7 +386,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runErr, runErr) orchErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runErr, runErr)
if _, err := orchErr.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query ingresses") { if _, err := orchErr.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query ingresses") {
t.Fatalf("expected ingress query error branch, got %v", err) t.Fatalf("expected ingress query error branch, got %v", err)
} }
@ -412,7 +412,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
URL: "http://" + listener.Addr().String() + "/health", URL: "http://" + listener.Addr().String() + "/health",
AcceptedStatuses: []int{200}, AcceptedStatuses: []int{200},
}} }}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
ready, detail := orch.TestHookServiceChecklistReady(context.Background()) ready, detail := orch.TestHookServiceChecklistReady(context.Background())
if ready || !strings.Contains(detail, "http://") { if ready || !strings.Contains(detail, "http://") {
t.Fatalf("expected service checklist URL-name fallback failure, ready=%v detail=%q", ready, detail) t.Fatalf("expected service checklist URL-name fallback failure, ready=%v detail=%q", ready, detail)
@ -435,7 +435,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cancel() cancel()
if err := orch.TestHookWaitForNodeInventoryReachability(ctx); !errors.Is(err, context.Canceled) { if err := orch.TestHookWaitForNodeInventoryReachability(ctx); !errors.Is(err, context.Canceled) {
@ -456,7 +456,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cancel() cancel()
if err := orch.TestHookWaitForPostStartProbes(ctx); !errors.Is(err, context.Canceled) { if err := orch.TestHookWaitForPostStartProbes(ctx); !errors.Is(err, context.Canceled) {
@ -478,7 +478,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if err := orch.TestHookResumeFluxAndReconcile(context.Background()); err != nil { if err := orch.TestHookResumeFluxAndReconcile(context.Background()); err != nil {
t.Fatalf("expected resume flux warning-only branch, got %v", err) t.Fatalf("expected resume flux warning-only branch, got %v", err)
} }
@ -505,7 +505,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cancel() cancel()
if err := orch.TestHookWaitForTimeSync(ctx, []string{"", "titan-db"}); !errors.Is(err, context.Canceled) { if err := orch.TestHookWaitForTimeSync(ctx, []string{"", "titan-db"}); !errors.Is(err, context.Canceled) {
@ -532,14 +532,14 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if err := orch.TestHookWaitForWorkloadConvergence(context.Background()); err != nil { if err := orch.TestHookWaitForWorkloadConvergence(context.Background()); err != nil {
t.Fatalf("expected workload convergence default-branch success, got %v", err) t.Fatalf("expected workload convergence default-branch success, got %v", err)
} }
cfgIgnore := lifecycleConfig(t) cfgIgnore := lifecycleConfig(t)
cfgIgnore.Startup.AutoRecycleStuckPods = false cfgIgnore.Startup.AutoRecycleStuckPods = false
orchIgnoreDry, _ := newHookOrchestratorWithRunnerMode(t, cfgIgnore, true, run, run) orchIgnoreDry, _ := newHookOrchestratorAdvanced(t, cfgIgnore, true, run, run)
now := time.Now().UTC().Add(-time.Hour) now := time.Now().UTC().Add(-time.Hour)
orchIgnoreDry.TestHookMaybeAutoRecycleStuckPods(context.Background(), &now) orchIgnoreDry.TestHookMaybeAutoRecycleStuckPods(context.Background(), &now)
orchIgnoreDry.TestHookMaybeAutoHealCriticalWorkloadReplicas(context.Background(), &now) orchIgnoreDry.TestHookMaybeAutoHealCriticalWorkloadReplicas(context.Background(), &now)
@ -551,7 +551,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchHealErr, _ := newHookOrchestratorWithRunnerMode(t, lifecycleConfig(t), false, runHealErr, runHealErr) orchHealErr, _ := newHookOrchestratorAdvanced(t, lifecycleConfig(t), false, runHealErr, runHealErr)
if _, err := orchHealErr.TestHookHealCriticalWorkloadReplicas(context.Background()); err == nil || !strings.Contains(err.Error(), "query workloads") { if _, err := orchHealErr.TestHookHealCriticalWorkloadReplicas(context.Background()); err == nil || !strings.Contains(err.Error(), "query workloads") {
t.Fatalf("expected critical workload heal query-error branch, got %v", err) t.Fatalf("expected critical workload heal query-error branch, got %v", err)
} }

View File

@ -20,7 +20,7 @@ import (
// newLifecycleMatrixOrchestrator runs one orchestration or CLI step. // newLifecycleMatrixOrchestrator runs one orchestration or CLI step.
// Signature: newLifecycleMatrixOrchestrator(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride, kubeconfig string) *cluster.Orchestrator. // Signature: newLifecycleMatrixOrchestrator(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride, kubeconfig string) *cluster.Orchestrator.
// Why: lifecycle cleanup scenarios need direct control over runner dry-run and kubeconfig branches. // Why: part11 needs direct control over runner dry-run and kubeconfig branches.
func newLifecycleMatrixOrchestrator( func newLifecycleMatrixOrchestrator(
t *testing.T, t *testing.T,
cfg config.Config, cfg config.Config,
@ -49,11 +49,11 @@ func newLifecycleMatrixOrchestrator(
return orch return orch
} }
// TestHookLifecycleCleanupRemainingClosure runs one orchestration or CLI step. // TestHookGapMatrixPart11RemainingClosure runs one orchestration or CLI step.
// Signature: TestHookLifecycleCleanupRemainingClosure(t *testing.T). // Signature: TestHookGapMatrixPart11RemainingClosure(t *testing.T).
// Why: closes final branch gaps for lifecycle + remaining near-threshold // Why: closes final branch gaps for lifecycle + remaining near-threshold
// orchestrator files so per-file coverage reaches the enforced 95% target. // orchestrator files so per-file coverage reaches the enforced 95% target.
func TestHookLifecycleCleanupRemainingClosure(t *testing.T) { func TestHookGapMatrixPart11RemainingClosure(t *testing.T) {
t.Run("critical-vault-final-closures", func(t *testing.T) { t.Run("critical-vault-final-closures", func(t *testing.T) {
t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) { t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
switch { switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"): case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
apiVersionCalls++ apiVersionCalls++
if apiVersionCalls <= 2 { if apiVersionCalls == 1 {
return "", errors.New("api down") return "", errors.New("api down")
} }
return "v1.31.0", nil return "v1.31.0", nil

View File

@ -17,11 +17,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookTimesyncAndStabilityMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
// Signature: TestHookTimesyncAndStabilityMatrix(t *testing.T). // Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
// Why: drives low-coverage time-sync, datastore parsing, and startup stability // Why: drives low-coverage time-sync, datastore parsing, and startup stability
// branches from the top-level testing module. // branches from the top-level testing module.
func TestHookTimesyncAndStabilityMatrix(t *testing.T) { func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) { t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
cases := []struct { cases := []struct {
line string line string
@ -162,11 +162,11 @@ func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
}) })
} }
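The parse-datastore-endpoint matrix above is table-driven. As a sketch of that style, here is a standalone case table against a hypothetical `parseEndpoint` stand-in (the real parser lives in the orchestrator internals and is not reproduced here); save as parse_test.go and run go test.

package parse

import (
	"strings"
	"testing"
)

// parseEndpoint is an illustrative stand-in for the datastore-endpoint parser
// the matrix above exercises.
func parseEndpoint(line string) (string, bool) {
	const prefix = "datastore-endpoint="
	if rest, ok := strings.CutPrefix(line, prefix); ok && rest != "" {
		return rest, true
	}
	return "", false
}

func TestParseEndpointMatrix(t *testing.T) {
	cases := []struct {
		name string
		line string
		want string
		ok   bool
	}{
		{name: "valid", line: "datastore-endpoint=https://titan-db:2379", want: "https://titan-db:2379", ok: true},
		{name: "empty", line: "", want: "", ok: false},
		{name: "wrong-key", line: "token=abc", want: "", ok: false},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, ok := parseEndpoint(tc.line)
			if got != tc.want || ok != tc.ok {
				t.Fatalf("parseEndpoint(%q) = (%q, %v), want (%q, %v)", tc.line, got, ok, tc.want, tc.ok)
			}
		})
	}
}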
// TestHookFluxScalingReportMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
// Signature: TestHookFluxScalingReportMatrix(t *testing.T). // Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
// Why: targets low branch density in flux-health, scaling snapshot handling, // Why: targets low branch density in flux-health, scaling snapshot handling,
// and report sanitization helpers. // and report sanitization helpers.
func TestHookFluxScalingReportMatrix(t *testing.T) { func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
t.Run("flux-helper-matrix", func(t *testing.T) { t.Run("flux-helper-matrix", func(t *testing.T) {
if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") { if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
t.Fatalf("expected immutable matcher true for uppercase+job variant") t.Fatalf("expected immutable matcher true for uppercase+job variant")
@ -241,11 +241,11 @@ func TestHookFluxScalingReportMatrix(t *testing.T) {
}) })
} }
// TestHookVaultAndCoordinationMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
// Signature: TestHookVaultAndCoordinationMatrix(t *testing.T). // Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
// Why: raises branch coverage on vault/key and coordination helpers without // Why: raises branch coverage on vault/key and coordination helpers without
// requiring package-local tests. // requiring package-local tests.
func TestHookVaultAndCoordinationMatrix(t *testing.T) { func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
t.Run("vault-unseal-and-file-branches", func(t *testing.T) { t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.VaultUnsealKeyFile = "" cfg.Startup.VaultUnsealKeyFile = ""
@ -296,11 +296,11 @@ func TestHookVaultAndCoordinationMatrix(t *testing.T) {
}) })
} }
// TestHookWorkloadIgnoreMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
// Signature: TestHookWorkloadIgnoreMatrix(t *testing.T). // Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
// Why: expands low branch coverage in workload ignore helpers and startup-failure // Why: expands low branch coverage in workload ignore helpers and startup-failure
// pod classification. // pod classification.
func TestHookWorkloadIgnoreMatrix(t *testing.T) { func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) {
t.Run("ignored-node-helper-matrix", func(t *testing.T) { t.Run("ignored-node-helper-matrix", func(t *testing.T) {
if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) { if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
t.Fatalf("expected selector-host ignored match") t.Fatalf("expected selector-host ignored match")

View File

@ -11,11 +11,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/config" "scm.bstein.dev/bstein/ananke/internal/config"
) )
// TestHookConvergenceAndStabilityMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart3ConvergenceAndStability runs one orchestration or CLI step.
// Signature: TestHookConvergenceAndStabilityMatrix(t *testing.T). // Signature: TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T).
// Why: raises coverage for startup convergence orchestration and stability gates // Why: raises coverage for startup convergence orchestration and stability gates
// that determine whether startup is considered truly complete. // that determine whether startup is considered truly complete.
func TestHookConvergenceAndStabilityMatrix(t *testing.T) { func TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T) {
t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) { t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) {
cfgIngress := lifecycleConfig(t) cfgIngress := lifecycleConfig(t)
cfgIngress.Startup.RequireIngressChecklist = true cfgIngress.Startup.RequireIngressChecklist = true
@ -108,11 +108,11 @@ func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
}) })
} }
// TestHookLifecycleRestoreShutdownMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart3LifecycleRestoreShutdown runs one orchestration or CLI step.
// Signature: TestHookLifecycleRestoreShutdownMatrix(t *testing.T). // Signature: TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T).
// Why: fills lifecycle restore/shutdown success paths that are easy to miss in // Why: fills lifecycle restore/shutdown success paths that are easy to miss in
// failure-focused drill tests. // failure-focused drill tests.
func TestHookLifecycleRestoreShutdownMatrix(t *testing.T) { func TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T) {
t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) { t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) {
cfgDry := lifecycleConfig(t) cfgDry := lifecycleConfig(t)
dry := newDryRunHookOrchestrator(t, cfgDry, nil) dry := newDryRunHookOrchestrator(t, cfgDry, nil)

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookCoordinationAndReachabilityMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart4CoordinationAndReachability runs one orchestration or CLI step.
// Signature: TestHookCoordinationAndReachabilityMatrix(t *testing.T). // Signature: TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T).
// Why: closes remaining coordination/reachability low branches with deterministic // Why: closes remaining coordination/reachability low branches with deterministic
// command responses and short timeouts. // command responses and short timeouts.
func TestHookCoordinationAndReachabilityMatrix(t *testing.T) { func TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T) {
t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) { t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Coordination.PeerHosts = []string{"titan-24"} cfg.Coordination.PeerHosts = []string{"titan-24"}
@ -136,11 +136,11 @@ func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
}) })
} }
// TestHookIngressServiceAndPostStartMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart4IngressServiceAndPostStart runs one orchestration or CLI step.
// Signature: TestHookIngressServiceAndPostStartMatrix(t *testing.T). // Signature: TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T).
// Why: drives ingress/service checklist and post-start branches that were still // Why: drives ingress/service checklist and post-start branches that were still
// under-covered after drill-focused matrix tests. // under-covered after drill-focused matrix tests.
func TestHookIngressServiceAndPostStartMatrix(t *testing.T) { func TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T) {
t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) { t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{ cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
@ -194,11 +194,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil) orch, _ := newHookOrchestrator(t, cfg, nil, nil)
ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{ ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{
Name: "forbidden-marker", Name: "forbidden-marker",
URL: srv.URL, URL: srv.URL,
AcceptedStatuses: []int{200}, AcceptedStatuses: []int{200},
BodyNotContains: "marker", BodyNotContains: "marker",
TimeoutSeconds: 2, TimeoutSeconds: 2,
}) })
if ok || !strings.Contains(detail, "forbidden marker") { if ok || !strings.Contains(detail, "forbidden marker") {
t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail) t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail)
@ -233,11 +233,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
}) })
} }
// TestHookReportScalingStorageDrainMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart4ReportScalingStorageDrain runs one orchestration or CLI step.
// Signature: TestHookReportScalingStorageDrainMatrix(t *testing.T). // Signature: TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T).
// Why: covers artifact, scaling snapshot, storage, and drain error branches that // Why: covers artifact, scaling snapshot, storage, and drain error branches that
// are difficult to hit from happy-path lifecycle drills. // are difficult to hit from happy-path lifecycle drills.
func TestHookReportScalingStorageDrainMatrix(t *testing.T) { func TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T) {
t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) { t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
reportsFile := filepath.Join(t.TempDir(), "reports-as-file") reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
@ -339,11 +339,11 @@ func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
}) })
} }
// TestHookTimesyncLifecycleAndAccessMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart4TimesyncLifecycleAndAccess runs one orchestration or CLI step.
// Signature: TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T). // Signature: TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T).
// Why: closes remaining timing/access/lifecycle branches that still sat below // Why: closes remaining timing/access/lifecycle branches that still sat below
// target after the earlier matrices. // target after the earlier matrices.
func TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T) { func TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T) {
t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) { t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.TimeSyncMode = "quorum" cfg.Startup.TimeSyncMode = "quorum"

View File

@ -20,11 +20,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookEndpointHealingCoverageClosure runs one orchestration or CLI step. // TestHookGapMatrixPart5CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookEndpointHealingCoverageClosure(t *testing.T). // Signature: TestHookGapMatrixPart5CoverageClosure(t *testing.T).
// Why: closes branch gaps that still remained after drill-style tests by driving // Why: closes branch gaps that still remained after drill-style tests by driving
// low-coverage orchestrator internals through the exported top-level hook surface. // low-coverage orchestrator internals through the exported top-level hook surface.
func TestHookEndpointHealingCoverageClosure(t *testing.T) { func TestHookGapMatrixPart5CoverageClosure(t *testing.T) {
t.Run("critical-endpoint-backend-heal-matrix", func(t *testing.T) { t.Run("critical-endpoint-backend-heal-matrix", func(t *testing.T) {
t.Run("empty-namespace-service-noop", func(t *testing.T) { t.Run("empty-namespace-service-noop", func(t *testing.T) {
orch, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil) orch, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
@ -491,10 +491,10 @@ func httpStatusHandler(code int, body string) func(http.ResponseWriter, *http.Re
} }
} }
// TestHookIngressHostMappingRegression runs one orchestration or CLI step. // TestHookGapMatrixPart5IngressHostMappingRegression runs one orchestration or CLI step.
// Signature: TestHookIngressHostMappingRegression(t *testing.T). // Signature: TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T).
// Why: ensures host parsing fallback paths stay stable for ingress/service checklist failures. // Why: ensures host parsing fallback paths stay stable for ingress/service checklist failures.
func TestHookIngressHostMappingRegression(t *testing.T) { func TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{ cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"}, {Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},

View File

@ -16,11 +16,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookVaultPostStartBranchMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart6CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookVaultPostStartBranchMatrix(t *testing.T). // Signature: TestHookGapMatrixPart6CoverageClosure(t *testing.T).
// Why: targets the remaining low branch paths after endpoint-healing coverage so per-file coverage // Why: targets the remaining low branch paths after part5 so per-file coverage
// can move toward the strict 95% quality gate. // can move toward the strict 95% quality gate.
func TestHookVaultPostStartBranchMatrix(t *testing.T) { func TestHookGapMatrixPart6CoverageClosure(t *testing.T) {
t.Run("critical-vault-and-poststart-branches", func(t *testing.T) { t.Run("critical-vault-and-poststart-branches", func(t *testing.T) {
t.Run("wait-vault-ready-dryrun-and-cancel", func(t *testing.T) { t.Run("wait-vault-ready-dryrun-and-cancel", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)

View File

@ -14,11 +14,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookWorkloadStorageAccessMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart7CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookWorkloadStorageAccessMatrix(t *testing.T). // Signature: TestHookGapMatrixPart7CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in convergence, storage, access, // Why: closes additional low-coverage branches in convergence, storage, access,
// flux, lifecycle, and sensitive command wrappers. // flux, lifecycle, and sensitive command wrappers.
func TestHookWorkloadStorageAccessMatrix(t *testing.T) { func TestHookGapMatrixPart7CoverageClosure(t *testing.T) {
t.Run("workload-convergence-branch-matrix", func(t *testing.T) { t.Run("workload-convergence-branch-matrix", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceWaitSeconds = 1 cfg.Startup.WorkloadConvergenceWaitSeconds = 1
@ -165,32 +165,6 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err) t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
} }
}) })
t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequireNodeSSHAuth = true
cfg.Startup.NodeSSHAuthWaitSeconds = 1
cfg.Startup.NodeSSHAuthPollSeconds = 1
cfg.Startup.NodeInventoryReachWaitSeconds = 1
cfg.Startup.NodeInventoryReachPollSeconds = 1
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") {
return "", errors.New("no route to host")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err)
}
if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil {
t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err)
}
})
}) })
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) { t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookAccessVaultLifecycleMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart8CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookAccessVaultLifecycleMatrix(t *testing.T). // Signature: TestHookGapMatrixPart8CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in access, vault, lifecycle, // Why: closes additional low-coverage branches in access, vault, lifecycle,
// ingress/service stability, and timesync/inventory orchestration paths. // ingress/service stability, and timesync/inventory orchestration paths.
func TestHookAccessVaultLifecycleMatrix(t *testing.T) { func TestHookGapMatrixPart8CoverageClosure(t *testing.T) {
t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) { t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.RequireNodeSSHAuth = true cfg.Startup.RequireNodeSSHAuth = true
@ -331,11 +331,11 @@ func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
}) })
} }
// TestHookLifecycleStartupAutoRestoreBranch runs one orchestration or CLI step. // TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
// Signature: TestHookLifecycleStartupAutoRestoreBranch(t *testing.T). // Signature: TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T).
// Why: covers Startup's API-failure->auto-restore retry path that is otherwise // Why: covers Startup's API-failure->auto-restore retry path that is otherwise
// hard to exercise in deterministic top-level tests. // hard to exercise in deterministic top-level tests.
func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) { func TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
cfg.Startup.EtcdRestoreControlPlane = "titan-db" cfg.Startup.EtcdRestoreControlPlane = "titan-db"
@ -384,7 +384,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
} }
} }
orch, _ := newHookOrchestrator(t, cfg, run, run) orch, _ := newHookOrchestrator(t, cfg, run, run)
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lifecycle-auto-restore"}) err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "part8-auto-restore"})
if err != nil { if err != nil {
t.Fatalf("expected startup auto-restore path success, got %v", err) t.Fatalf("expected startup auto-restore path success, got %v", err)
} }
@ -394,7 +394,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
cfgBadMode := lifecycleConfig(t) cfgBadMode := lifecycleConfig(t)
orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil) orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil)
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "lifecycle", Mode: "unknown-mode"}) err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "part8", Mode: "unknown-mode"})
if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") { if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") {
t.Fatalf("expected shutdown unsupported-mode branch, got %v", err) t.Fatalf("expected shutdown unsupported-mode branch, got %v", err)
} }

View File

@ -16,11 +16,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookAccessCoordinationEndpointsMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart9AccessCoordinationEndpoints runs one orchestration or CLI step.
// Signature: TestHookAccessCoordinationEndpointsMatrix(t *testing.T). // Signature: TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T).
// Why: closes uncovered statement ranges in access/fluxsource, coordination, // Why: closes uncovered statement ranges in access/fluxsource, coordination,
// and critical-endpoint orchestration helpers. // and critical-endpoint orchestration helpers.
func TestHookAccessCoordinationEndpointsMatrix(t *testing.T) { func TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T) {
t.Run("access-fluxsource-uncovered-ranges", func(t *testing.T) { t.Run("access-fluxsource-uncovered-ranges", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Shutdown.SSHParallelism = 0 cfg.Shutdown.SSHParallelism = 0

View File

@ -53,48 +53,6 @@ func TestHookIngressServiceMatrix(t *testing.T) {
} }
}) })
t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command)
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err)
}
})
t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
}
})
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) { t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK) w.WriteHeader(http.StatusOK)

View File

@ -4,6 +4,7 @@ import (
"context" "context"
"errors" "errors"
"net" "net"
"os"
"strings" "strings"
"testing" "testing"
"time" "time"
@ -124,25 +125,20 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) { t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t) cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false cfg.Startup.ShutdownCooldownSeconds = 1
cfg.Startup.ShutdownCooldownSeconds = 5 if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
reads := 0 State: state.IntentShutdownComplete,
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) { Reason: "recent",
if path != cfg.State.IntentPath { Source: "test",
return state.TestHookReadIntentDefault(path) UpdatedAt: time.Now().UTC(),
} }); err != nil {
reads++ t.Fatalf("seed cooldown intent: %v", err)
if reads == 1 { }
return state.Intent{ go func(intentPath string) {
State: state.IntentShutdownComplete, time.Sleep(150 * time.Millisecond)
Reason: "recent", _ = os.Remove(intentPath)
Source: "test", _ = os.Mkdir(intentPath, 0o755)
UpdatedAt: time.Now().UTC().Add(-4 * time.Second), }(cfg.State.IntentPath)
}, nil
}
return state.Intent{}, errors.New("forced reread failure")
})
t.Cleanup(restoreRead)
orch, _ := newHookOrchestrator(t, cfg, nil, nil) orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"}) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") { if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
@ -152,30 +148,24 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) { t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
cfg := lifecycleFastConfig(t) cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false cfg.Startup.ShutdownCooldownSeconds = 1
cfg.Startup.ShutdownCooldownSeconds = 5 if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
reads := 0 State: state.IntentShutdownComplete,
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) { Reason: "recent",
if path != cfg.State.IntentPath { Source: "test",
return state.TestHookReadIntentDefault(path) UpdatedAt: time.Now().UTC(),
} }); err != nil {
reads++ t.Fatalf("seed cooldown intent: %v", err)
if reads == 1 { }
return state.Intent{ go func(intentPath string) {
State: state.IntentShutdownComplete, time.Sleep(150 * time.Millisecond)
Reason: "recent", _ = state.WriteIntent(intentPath, state.Intent{
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{
State: state.IntentShuttingDown, State: state.IntentShuttingDown,
Reason: "peer-shutdown", Reason: "peer-shutdown",
Source: "test", Source: "test",
UpdatedAt: time.Now().UTC(), UpdatedAt: time.Now().UTC(),
}, nil })
}) }(cfg.State.IntentPath)
t.Cleanup(restoreRead)
orch, _ := newHookOrchestrator(t, cfg, nil, nil) orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"}) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") { if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {

View File

@ -1,432 +0,0 @@
package orchestrator
import (
"context"
"errors"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
)
// TestHookSchedulingStormHelpers runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormHelpers(t *testing.T).
// Why: keeps scheduling-storm helper coverage in the split top-level testing module
// required by the repo hygiene contract.
func TestHookSchedulingStormHelpers(t *testing.T) {
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "ollama-rs", "Deployment", "ollama"); !ok || got != "ai/deployment/ollama" {
t.Fatalf("unexpected deployment owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("storage", "StatefulSet", "nextcloud", "", ""); !ok || got != "storage/statefulset/nextcloud" {
t.Fatalf("unexpected statefulset owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "missing", "", ""); ok || got != "" {
t.Fatalf("expected missing replicaset owner lookup to fail, got=%q ok=%v", got, ok)
}
if got := cluster.TestHookEventObservationCount(3, 9); got != 9 {
t.Fatalf("expected series count to win, got %d", got)
}
if got := cluster.TestHookEventObservationCount(0, 0); got != 1 {
t.Fatalf("expected zero-count normalization to 1, got %d", got)
}
now := time.Now().UTC().Round(time.Second)
if got := cluster.TestHookEventLastObservedAt(now, now.Add(-time.Minute), now.Add(-2*time.Minute), now.Add(-3*time.Minute)); !got.Equal(now) {
t.Fatalf("expected series timestamp priority, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, now, now.Add(-time.Minute), now.Add(-2*time.Minute)); !got.Equal(now) {
t.Fatalf("expected lastTimestamp fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, now, now.Add(-time.Minute)); !got.Equal(now) {
t.Fatalf("expected eventTime fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, time.Time{}, now); !got.Equal(now) {
t.Fatalf("expected creationTimestamp fallback, got %s", got)
}
}
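The removed helper tests above pin down two small behaviors: observation counts prefer the event-series count and never drop below one, and the last-observed timestamp falls back through series, lastTimestamp, eventTime, then creationTimestamp. A sketch reconstructing that logic from the assertions (argument order and names are inferred, not copied from the repo):

package main

import (
	"fmt"
	"time"
)

// lastObservedAt picks the freshest meaningful timestamp for an event in the
// priority order the tests above assert: series last-seen, then
// lastTimestamp, then eventTime, then creationTimestamp.
func lastObservedAt(series, last, eventTime, creation time.Time) time.Time {
	for _, ts := range []time.Time{series, last, eventTime, creation} {
		if !ts.IsZero() {
			return ts
		}
	}
	return time.Time{}
}

// observationCount prefers the series count, falls back to the event count,
// and never reports fewer than one observation.
func observationCount(eventCount, seriesCount int) int {
	if seriesCount > 0 {
		return seriesCount
	}
	if eventCount > 0 {
		return eventCount
	}
	return 1
}

func main() {
	now := time.Now().UTC()
	fmt.Println(lastObservedAt(time.Time{}, now, now.Add(-time.Minute), now.Add(-2*time.Minute)).Equal(now)) // true: lastTimestamp fallback
	fmt.Println(observationCount(3, 9), observationCount(0, 0))                                              // 9 1
}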
// TestHookSchedulingStormQuarantine runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormQuarantine(t *testing.T).
// Why: verifies that only non-core workloads generating real scheduling storms
// are auto-quarantined, which prevents event/Kine churn from spiking control-plane CPU.
func TestHookSchedulingStormQuarantine(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 30
cfg.Startup.SchedulingStormWindowSeconds = 180
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault"}
cfg.Startup.IgnoreWorkloadNamespaces = []string{"ignored-ns"}
cfg.Startup.IgnoreWorkloads = []string{"monitoring/deployment/ignore-me"}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-22"}
scaledOllama := false
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"vault","name":"vault-0","ownerReferences":[{"kind":"StatefulSet","name":"vault"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"ignored-ns","name":"skip-pod","ownerReferences":[{"kind":"ReplicaSet","name":"skip-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignore-me-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignored-node-rs"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"running-pod","ownerReferences":[{"kind":"ReplicaSet","name":"running-rs"}]},"spec":{},"status":{"phase":"Running"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}},
{"metadata":{"namespace":"ignored-ns","name":"skip-rs","ownerReferences":[{"kind":"Deployment","name":"skip"}]}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-rs","ownerReferences":[{"kind":"Deployment","name":"ignore-me"}]}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-rs","ownerReferences":[{"kind":"Deployment","name":"ignored-node"}]}},
{"metadata":{"namespace":"monitoring","name":"running-rs","ownerReferences":[{"kind":"Deployment","name":"running"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"vault","name":"vault-0"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ignored-ns","name":"skip-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignore-me-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignored-node-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"running-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"stale-pod"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}},
{"kind":"StatefulSet","metadata":{"namespace":"vault","name":"vault"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"ignored-ns","name":"skip"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignore-me"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignored-node"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"running"},"spec":{"replicas":1}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
scaledOllama = true
return "", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm")
defer orch.TestHookFinalizeStartupReport(nil)
if err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background()); err != nil {
t.Fatalf("quarantine scheduling storm workloads: %v", err)
}
if !scaledOllama {
t.Fatalf("expected ollama deployment to be scaled to zero")
}
progress := readStartupProgress(t, orch)
if !strings.Contains(progress, "ollama") {
t.Fatalf("expected startup progress to mention ollama quarantine, payload=%s", progress)
}
if strings.Contains(progress, "vault") || strings.Contains(progress, "ignore-me") || strings.Contains(progress, "ignored-node") {
t.Fatalf("expected only the non-core eligible workload to be quarantined, payload=%s", progress)
}
}
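// Hypothetical eligibility sketch for the behavior verified above: a storming
// workload is quarantined only when it is still pending and no required or
// ignore scope protects it. Names and signature are illustrative, not the
// shipped API.
func sketchQuarantineEligible(pending bool, namespace, workload, node string,
	requiredNS, ignoredNS, ignoredWorkloads, ignoredNodes map[string]bool) bool {
	if !pending {
		return false // running pods are not part of a scheduling storm
	}
	if requiredNS[namespace] || ignoredNS[namespace] {
		return false // core and explicitly ignored namespaces stay untouched
	}
	return !ignoredWorkloads[workload] && !ignoredNodes[node]
}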
// TestHookSchedulingStormTriggerGuards runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerGuards(t *testing.T).
// Why: covers dry-run/disabled/rate-limit guards so the scheduler-storm auto-heal
// only activates when the cluster is actually suffering this exact failure mode.
func TestHookSchedulingStormTriggerGuards(t *testing.T) {
cfgDisabled := lifecycleConfig(t)
orchDisabled, _ := newHookOrchestrator(t, cfgDisabled, nil, nil)
lastAttempt := time.Time{}
orchDisabled.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected disabled scheduling-storm trigger to be skipped")
}
cfgDry := lifecycleConfig(t)
cfgDry.Startup.AutoQuarantineSchedulingStorms = true
orchDry := newDryRunHookOrchestrator(t, cfgDry, nil)
orchDry.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected dry-run scheduling-storm trigger to be skipped")
}
cfgRate := lifecycleConfig(t)
cfgRate.Startup.AutoQuarantineSchedulingStorms = true
cfgRate.Startup.SchedulingStormEventThreshold = 5
cfgRate.Startup.SchedulingStormWindowSeconds = 60
recorder := &commandRecorder{}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
recorder.record(name, args)
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[]}`, nil
default:
return lifecycleDispatcher(recorder)(ctx, timeout, name, args...)
}
}
orchRate, _ := newHookOrchestrator(t, cfgRate, run, run)
lastAttempt = time.Now()
orchRate.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if recorder.contains("get pods -A -o json") {
t.Fatalf("expected rate-limited scheduling-storm trigger to skip kubectl scans")
}
}
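// Hypothetical guard sketch mirroring the three skips above: the trigger is a
// no-op when the feature is disabled or running dry, and is rate-limited by a
// minimum interval between scans. minInterval is an assumed parameter.
func sketchShouldScanForStorms(enabled, dryRun bool, lastAttempt time.Time, minInterval time.Duration) bool {
	if !enabled || dryRun {
		return false
	}
	return lastAttempt.IsZero() || time.Since(lastAttempt) >= minInterval
}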
// TestHookSchedulingStormTriggerAndNoOpBranches runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T).
// Why: raises scheduling-storm branch coverage on the success/no-op paths so the
// auto-heal only acts on genuine event storms and stays quiet otherwise.
func TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 0
cfg.Startup.SchedulingStormWindowSeconds = 0
scanRan := false
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
scanRan = true
return `{"items":[
{"metadata":{"namespace":"","name":"missing"}},
{"metadata":{"namespace":"monitoring","name":"no-owner"},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"done","ownerReferences":[{"kind":"ReplicaSet","name":"done-rs"}]},"spec":{},"status":{"phase":"Running"}},
{"metadata":{"namespace":"monitoring","name":"zero-replicas","ownerReferences":[{"kind":"ReplicaSet","name":"zero-rs"}]},"spec":{},"status":{"phase":"Pending"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"","name":"bad-rs"}},
{"metadata":{"namespace":"monitoring","name":"done-rs","ownerReferences":[{"kind":"","name":"ignored"}]}},
{"metadata":{"namespace":"monitoring","name":"zero-rs","ownerReferences":[{"kind":"Deployment","name":"zero"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"normal"},"type":"Normal","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"wrong-reason"},"type":"Warning","reason":"SomeOtherReason","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Service","namespace":"monitoring","name":"wrong-kind"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"old"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"low-count"},"type":"Warning","reason":"FailedScheduling","count":1},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"missing-pod"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"done"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"no-owner"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"zero-replicas"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"","metadata":{"namespace":"monitoring","name":"blank-kind"}},
{"kind":"Job","metadata":{"namespace":"monitoring","name":"unsupported"}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"zero"},"spec":{"replicas":0}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm-noop")
defer orch.TestHookFinalizeStartupReport(nil)
lastAttempt := time.Time{}
orch.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if lastAttempt.IsZero() {
t.Fatalf("expected successful scheduling-storm trigger to update lastAttempt")
}
if !scanRan {
t.Fatalf("expected scheduling-storm scan to execute")
}
progress := readStartupProgress(t, orch)
if strings.Contains(progress, "quarantined scheduling storm workload") {
t.Fatalf("expected no-op scheduling-storm scan to avoid auto-heal output, payload=%s", progress)
}
}
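// Hypothetical filter sketch for the no-op branches above: an event counts
// toward a storm only when it is a Warning FailedScheduling event against a
// Pod, observed within the window, and at or above the count threshold.
func sketchIsStormEvent(kind, eventType, reason string, observed time.Time, count, threshold int, window time.Duration) bool {
	if kind != "Pod" || eventType != "Warning" || reason != "FailedScheduling" {
		return false
	}
	return time.Since(observed) <= window && count >= threshold
}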
// TestHookSchedulingStormErrorMatrix runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormErrorMatrix(t *testing.T).
// Why: covers malformed/error response branches in the scheduling-storm scan so
// Ananke can surface precise diagnostics when the API itself is part of the problem.
func TestHookSchedulingStormErrorMatrix(t *testing.T) {
cases := []struct {
name string
run func(context.Context, time.Duration, string, ...string) (string, error)
wantErr string
}{
{
name: "pods-query-error",
run: func(_ context.Context, _ time.Duration, name string, _ ...string) (string, error) {
if name == "kubectl" {
return "", errors.New("pods boom")
}
return "", nil
},
wantErr: "query pods for scheduling storm scan",
},
{
name: "pods-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name == "kubectl" && strings.Contains(strings.Join(args, " "), "get pods -A -o json") {
return "{", nil
}
return `{"items":[]}`, nil
},
wantErr: "decode pods for scheduling storm scan",
},
{
name: "replicasets-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "", errors.New("replicasets boom")
default:
return "", nil
}
},
wantErr: "query replicasets for scheduling storm scan",
},
{
name: "replicasets-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode replicasets for scheduling storm scan",
},
{
name: "events-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "", errors.New("events boom")
default:
return "", nil
}
},
wantErr: "query events for scheduling storm scan",
},
{
name: "events-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode events for scheduling storm scan",
},
{
name: "workloads-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "", errors.New("workloads boom")
default:
return "", nil
}
},
wantErr: "query workloads for scheduling storm scan",
},
{
name: "workloads-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "{", nil
default:
return "", nil
}
},
wantErr: "decode workloads for scheduling storm scan",
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
orch, _ := newHookOrchestrator(t, cfg, tc.run, tc.run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
t.Fatalf("expected error containing %q, got %v", tc.wantErr, err)
}
})
}
}
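// Hypothetical sketch of the stage-prefixed wrapping the matrix asserts; the
// shipped scan is not reproduced here, and fmt, encoding/json, and the pods
// variable are assumed:
//
//	out, err := run(ctx, timeout, "kubectl", "get", "pods", "-A", "-o", "json")
//	if err != nil {
//		return fmt.Errorf("query pods for scheduling storm scan: %w", err)
//	}
//	if err := json.Unmarshal([]byte(out), &pods); err != nil {
//		return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
//	}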
// TestHookSchedulingStormScaleError runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormScaleError(t *testing.T).
// Why: covers the final error path where Ananke detects a real storm but cannot
// scale the offending workload down.
func TestHookSchedulingStormScaleError(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 5
cfg.Startup.SchedulingStormWindowSeconds = 60
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45}]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}}]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
return "", errors.New("scale denied")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), "scale scheduling storm workload ai/deployment/ollama to 0") {
t.Fatalf("expected scale error, got %v", err)
}
}
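// Hypothetical sketch of the scale step this test forces to fail; fmt and the
// scan-derived ns/name values are assumed:
//
//	if _, err := run(ctx, timeout, "kubectl", "-n", ns, "scale", "deployment", name, "--replicas=0"); err != nil {
//		return fmt.Errorf("scale scheduling storm workload %s/deployment/%s to 0: %w", ns, name, err)
//	}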


@@ -1,222 +0,0 @@
package orchestrator

import (
	"context"
	"errors"
	"os"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
)
// readStartupProgress reads the orchestrator's startup progress artifact.
// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
// Why: startup helper tests need to inspect progress artifacts without reaching
// into internal package state from the top-level testing module.
func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
t.Helper()
payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
if err != nil {
t.Fatalf("read startup progress: %v", err)
}
return string(payload)
}
// TestHookStartupScopeAndVaultHelpers runs one orchestration or CLI step.
// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
// Why: keeps startup-scope and startup-Vault helper branches covered from the
// split top-level testing module required by the repo hygiene contract.
func TestHookStartupScopeAndVaultHelpers(t *testing.T) {
t.Run("startup-scope-helpers", func(t *testing.T) {
nodes := []string{"titan-db", " titan-23 ", "", "titan-24"}
if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) {
t.Fatalf("expected passthrough node list, got %v", got)
}
got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"})
if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" {
t.Fatalf("unexpected filtered node list: %v", got)
}
if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") {
t.Fatalf("expected trimmed node membership match")
}
if cluster.TestHookContainsNode([]string{"titan-db"}, " ") {
t.Fatalf("expected blank node probe to be ignored")
}
cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
if !orch.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected all nodes to be strict when no recovery scopes are configured")
}
cfgScoped := lifecycleConfig(t)
cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"}
cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"}
cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "}
cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "}
orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil)
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") {
t.Fatalf("expected control plane to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected inventory-scoped node to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") {
t.Fatalf("expected ssh-scoped node to remain strict")
}
if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") {
t.Fatalf("expected non-core worker to stop being strict")
}
flux := orchScoped.TestHookStartupRequiredFluxKustomizations()
if _, ok := flux["flux-system/core"]; !ok {
t.Fatalf("expected core flux kustomization in required set: %v", flux)
}
if _, ok := flux["flux-system/gitea"]; !ok {
t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux)
}
namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces()
if _, ok := namespaces["vault"]; !ok {
t.Fatalf("expected vault namespace in required set: %v", namespaces)
}
if _, ok := namespaces["monitoring"]; !ok {
t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces)
}
})
t.Run("startup-vault-helpers", func(t *testing.T) {
t.Run("early-vault-unseal-paths", func(t *testing.T) {
cfgAPI := lifecycleConfig(t)
runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
return "", errors.New("api down")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI)
orchAPI.TestHookBeginStartupReport("startup-vault")
orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") {
t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload)
}
cfgErr := lifecycleConfig(t)
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "", errors.New("phase probe failed")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
orchErr.TestHookBeginStartupReport("startup-vault")
orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") {
t.Fatalf("expected early vault auto-heal detail, payload=%s", payload)
}
cfgDeferred := lifecycleConfig(t)
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Pending", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
orchDeferred.TestHookBeginStartupReport("startup-vault")
orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
t.Fatalf("expected deferred early vault detail, payload=%s", payload)
}
cfgSuccess := lifecycleConfig(t)
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
return `{"sealed":false,"initialized":true}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
orchSuccess.TestHookBeginStartupReport("startup-vault")
orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) {
t.Fatalf("expected successful early vault check, payload=%s", payload)
}
})
t.Run("startup-vault-gate-paths", func(t *testing.T) {
cfgErr := lifecycleConfig(t)
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
return "", errors.New("phase probe failed")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
orchErr.TestHookBeginStartupReport("startup-vault")
if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") {
t.Fatalf("expected startup vault gate error, got %v", err)
}
cfgDeferred := lifecycleConfig(t)
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
return "Pending", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
orchDeferred.TestHookBeginStartupReport("startup-vault")
if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
t.Fatalf("expected deferred startup vault gate to succeed, got %v", err)
}
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
t.Fatalf("expected deferred startup vault detail, payload=%s", payload)
}
cfgSuccess := lifecycleConfig(t)
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
return `{"sealed":false,"initialized":true}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
orchSuccess.TestHookBeginStartupReport("startup-vault")
if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
t.Fatalf("expected successful startup vault gate, got %v", err)
}
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) {
t.Fatalf("expected successful startup vault detail, payload=%s", payload)
}
})
})
}
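// Hypothetical sketch of the gate decision the vault sub-tests walk through;
// runKubectl and reportDetail are illustrative stand-ins, not the shipped API:
//
//	phase, err := runKubectl(ctx, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
//	if err != nil {
//		return err // the startup gate surfaces this; early mode records it as auto-heal detail
//	}
//	if strings.TrimSpace(phase) != "Running" {
//		reportDetail("deferred early vault unseal: vault-0 pod phase is " + phase)
//		return nil
//	}
//	// Running: "vault status -format=json" must then report sealed=false.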


@@ -24,36 +24,12 @@ func TestStateTestHookOverrideSetters(t *testing.T) {
 	}
 	restoreWriteNil()
-	restoreReadNil := state.TestHookSetReadIntentOverride(nil)
-	readAfterNil, err := state.ReadIntent(intentPath)
-	if err != nil || readAfterNil.State != state.IntentNormal {
-		t.Fatalf("expected default read intent path after nil override, got %v / %v", readAfterNil, err)
-	}
-	restoreReadNil()
-	readOverrideCalled := false
-	restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
-		readOverrideCalled = true
-		return state.Intent{}, errors.New("forced read override")
-	})
-	_, err = state.ReadIntent(intentPath)
-	if err == nil || !strings.Contains(err.Error(), "forced read override") {
-		t.Fatalf("expected forced read override error, got %v", err)
-	}
-	if !readOverrideCalled {
-		t.Fatalf("expected read override to be invoked")
-	}
-	restoreRead()
-	if _, err := state.TestHookReadIntentDefault(intentPath); err != nil {
-		t.Fatalf("expected explicit default read helper to succeed, got %v", err)
-	}
 	writeOverrideCalled := false
 	restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
 		writeOverrideCalled = true
 		return errors.New("forced write override")
 	})
-	err = state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
+	err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
 	if err == nil || !strings.Contains(err.Error(), "forced write override") {
 		t.Fatalf("expected forced write override error, got %v", err)
 	}