Compare commits


No commits in common. "main" and "codex/ananke-gate-platform-metrics" have entirely different histories.

64 changed files with 1157 additions and 4539 deletions

.gitignore vendored

@@ -1,6 +1,4 @@
 /bin/
-/build/
 /dist/
-internal/state/.corrupt-*
 *.log
 *.tmp

Jenkinsfile vendored

@@ -1,59 +1,25 @@
 pipeline {
   agent {
     kubernetes {
-      label 'ananke-quality'
       defaultContainer 'go-tester'
       yaml """
 apiVersion: v1
 kind: Pod
 spec:
   nodeSelector:
-    hardware: rpi5
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: "true"
-  affinity:
-    nodeAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-        nodeSelectorTerms:
-        - matchExpressions:
-          - key: kubernetes.io/hostname
-            operator: NotIn
-            values:
-            - titan-06
-      preferredDuringSchedulingIgnoredDuringExecution:
-      - weight: 100
-        preference:
-          matchExpressions:
-          - key: kubernetes.io/hostname
-            operator: NotIn
-            values:
-            - titan-13
-            - titan-15
-            - titan-17
-            - titan-19
-  topologySpreadConstraints:
-  - maxSkew: 1
-    topologyKey: kubernetes.io/hostname
-    whenUnsatisfiable: ScheduleAnyway
-    labelSelector:
-      matchLabels:
-        jenkins/jenkins-jenkins-agent: "true"
   containers:
   - name: go-tester
-    image: registry.bstein.dev/bstein/golang:1.25-bookworm
+    image: golang:1.25-bookworm
     command: ["cat"]
     tty: true
     volumeMounts:
     - name: workspace-volume
       mountPath: /home/jenkins/agent
   - name: publisher
-    image: registry.bstein.dev/bstein/python:3.12-slim
-    command: ["cat"]
-    tty: true
-    volumeMounts:
-    - name: workspace-volume
-      mountPath: /home/jenkins/agent
-  - name: quality-tools
-    image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
+    image: python:3.12-slim
     command: ["cat"]
     tty: true
     volumeMounts:
@@ -69,13 +35,7 @@ spec:
   environment {
     SUITE_NAME = 'ananke'
     PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
-    SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
-    SONARQUBE_PROJECT_KEY = 'ananke'
-    SONARQUBE_TOKEN = credentials('sonarqube-token')
-    QUALITY_GATE_SONARQUBE_ENFORCE = '1'
     QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
-    QUALITY_GATE_IRONBANK_ENFORCE = '1'
-    QUALITY_GATE_IRONBANK_REQUIRED = '0'
     QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
   }
@@ -97,27 +57,6 @@ spec:
     stage('Collect SonarQube evidence') {
       steps {
-        container('quality-tools') {
-          sh '''#!/usr/bin/env bash
-set -euo pipefail
-mkdir -p build
-args=(
-  "-Dsonar.host.url=${SONARQUBE_HOST_URL}"
-  "-Dsonar.login=${SONARQUBE_TOKEN}"
-  "-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
-  "-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
-  "-Dsonar.sources=."
-  "-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**"
-  "-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
-)
-[ -f build/coverage.out ] && args+=("-Dsonar.go.coverage.reportPaths=build/coverage.out")
-set +e
-sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
-rc=${PIPESTATUS[0]}
-set -e
-printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
-'''
-        }
         container('publisher') {
           sh '''
 set -eu
@@ -156,34 +95,6 @@ PY
     stage('Collect Supply Chain evidence') {
       steps {
-        container('quality-tools') {
-          sh '''#!/usr/bin/env bash
-set -euo pipefail
-mkdir -p build
-set +e
-trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
-trivy_rc=$?
-set -e
-if [ ! -s build/trivy-fs.json ]; then
-  cat > build/ironbank-compliance.json <<EOF
-{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
-EOF
-  exit 0
-fi
-critical="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="CRITICAL")] | length' build/trivy-fs.json)"
-high="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="HIGH")] | length' build/trivy-fs.json)"
-secrets="$(jq '[.Results[]? | .Secrets[]?] | length' build/trivy-fs.json)"
-misconfigs="$(jq '[.Results[]? | .Misconfigurations[]? | select(.Status=="FAIL" and (.Severity=="CRITICAL" or .Severity=="HIGH"))] | length' build/trivy-fs.json)"
-status=ok
-compliant=true
-if [ "${critical}" -gt 0 ] || [ "${secrets}" -gt 0 ] || [ "${misconfigs}" -gt 0 ]; then
-  status=failed
-  compliant=false
-fi
-jq -n --arg status "${status}" --argjson compliant "${compliant}" --argjson critical "${critical}" --argjson high "${high}" --argjson secrets "${secrets}" --argjson misconfigs "${misconfigs}" --argjson trivy_rc "${trivy_rc}" \
-  '{status:$status, compliant:$compliant, category:"artifact_security", scan_type:"filesystem", scanner:"trivy", critical_vulnerabilities:$critical, high_vulnerabilities:$high, secrets:$secrets, high_or_critical_misconfigurations:$misconfigs, trivy_rc:$trivy_rc, high_vulnerability_policy:"observe"}' > build/ironbank-compliance.json
-'''
-        }
         container('publisher') {
           sh '''
 set -eu
@@ -241,25 +152,13 @@ PY
 failed_runs="$(awk -F= '$1=="failed"{print $2}' build/quality-gate.state 2>/dev/null | tail -n1)"
 [ -n "${ok_runs}" ] || ok_runs=0
 [ -n "${failed_runs}" ] || failed_runs=0
-coverage_percent="$(python3 - <<'PY'
-import re
-from pathlib import Path
-log_path = Path("build/quality-gate.out")
-text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else ""
-values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)]
-print(values[-1] if values else 0.0)
-PY
-)"
-printf '%s\n' "${coverage_percent}" > build/coverage-percent.txt
 python3 scripts/publish_quality_metrics.py \
   --pushgateway-url "${PUSHGATEWAY_URL}" \
   --job-name platform-quality-ci \
   --suite "${SUITE_NAME}" \
   --trigger jenkins \
   --local-ok "${ok_runs}" \
-  --local-failed "${failed_runs}" \
-  --coverage-percent-file build/coverage-percent.txt
+  --local-failed "${failed_runs}"
 '''
         }
       }
@@ -270,95 +169,7 @@ PY
         container('publisher') {
           sh '''
 set -eu
-gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
-fail=0
-if [ "${gate_rc}" -ne 0 ]; then
-  echo "quality gate failed with rc=${gate_rc}" >&2
-  fail=1
-fi
-enabled() {
-  case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
-    1|true|yes|on) return 0 ;;
-    *) return 1 ;;
-  esac
-}
-if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
-  sonar_status="$(python3 - <<'PY'
-import json
-from pathlib import Path
-path = Path("build/sonarqube-quality-gate.json")
-if not path.exists():
-    print("missing")
-    raise SystemExit(0)
-try:
-    payload = json.loads(path.read_text(encoding="utf-8"))
-except Exception: # noqa: BLE001
-    print("error")
-    raise SystemExit(0)
-status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
-print(status or "missing")
-PY
-)"
-  case "${sonar_status}" in
-    ok|pass|passed|success) ;;
-    *)
-      echo "sonarqube gate failed: ${sonar_status}" >&2
-      fail=1
-      ;;
-  esac
-fi
-ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
-if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
-  ironbank_required=1
-fi
-if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
-  supply_status="$(python3 - <<'PY'
-import json
-from pathlib import Path
-path = Path("build/ironbank-compliance.json")
-if not path.exists():
-    print("missing")
-    raise SystemExit(0)
-try:
-    payload = json.loads(path.read_text(encoding="utf-8"))
-except Exception: # noqa: BLE001
-    print("error")
-    raise SystemExit(0)
-compliant = payload.get("compliant")
-if compliant is True:
-    print("ok")
-elif compliant is False:
-    print("failed")
-else:
-    status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
-    print(status or "missing")
-PY
-)"
-  case "${supply_status}" in
-    ok|pass|passed|success|compliant) ;;
-    not_applicable|na|n/a)
-      if enabled "${ironbank_required}"; then
-        echo "supply chain gate required but status=${supply_status}" >&2
-        fail=1
-      fi
-      ;;
-    *)
-      if enabled "${ironbank_required}"; then
-        echo "supply chain gate failed: ${supply_status}" >&2
-        fail=1
-      else
-        echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
-      fi
-      ;;
-  esac
-fi
-exit "${fail}"
+test "$(cat build/quality-gate.rc 2>/dev/null || echo 1)" -eq 0
 '''
         }
       }
@@ -367,7 +178,7 @@ PY
   post {
     always {
-      archiveArtifacts artifacts: 'build/*.json,build/*.out,build/*.rc,build/*.txt,build/*.xml', allowEmptyArchive: true, fingerprint: true
+      archiveArtifacts artifacts: 'build/quality-gate.out,build/quality-gate.rc', allowEmptyArchive: true, fingerprint: true
     }
   }
 }
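The enforcement step removed above normalizes several possible report shapes into one status string before gating. Below is a minimal Go sketch of that normalization, assuming only the payload keys the deleted heredoc probed (status, projectStatus.status, qualityGate.status) and the accepted values from its case arm; it is an illustration, not code from this repository.

// gate_status_sketch: hypothetical, mirrors the removed publisher heredoc.
package main

import (
	"encoding/json"
	"fmt"
	"os"
	"strings"
)

type sonarGateReport struct {
	Status        string `json:"status"`
	ProjectStatus struct {
		Status string `json:"status"`
	} `json:"projectStatus"`
	QualityGate struct {
		Status string `json:"status"`
	} `json:"qualityGate"`
}

// sonarGateStatus returns "missing" if the report is absent, "error" if it
// cannot be parsed, otherwise the first non-empty status, lower-cased.
func sonarGateStatus(path string) string {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "missing"
	}
	var r sonarGateReport
	if err := json.Unmarshal(raw, &r); err != nil {
		return "error"
	}
	for _, s := range []string{r.Status, r.ProjectStatus.Status, r.QualityGate.Status} {
		if s = strings.ToLower(strings.TrimSpace(s)); s != "" {
			return s
		}
	}
	return "missing"
}

func main() {
	switch status := sonarGateStatus("build/sonarqube-quality-gate.json"); status {
	case "ok", "pass", "passed", "success": // accepted values from the removed case arm
		fmt.Println("sonarqube gate passed")
	default:
		fmt.Printf("sonarqube gate failed: %s\n", status)
		os.Exit(1)
	}
}

Treating "missing" and "error" as failures (rather than silently passing) is the same fail-closed choice the deleted shell logic made.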


@@ -97,15 +97,10 @@ Primary config path:
 Keep these fields accurate:
 - `expected_flux_source_url`
 - `expected_flux_branch`
-- `startup.service_checklist_explicit_only`
 - `startup.service_checklist`
 - `startup.critical_service_endpoints`
 - `startup.require_ingress_checklist`
 - `startup.require_node_inventory_reachability`
-- `startup.node_inventory_reachability_required_nodes`
-- `startup.node_ssh_auth_required_nodes`
-- `startup.flux_health_required_kustomizations`
-- `startup.workload_convergence_required_namespaces`
 - `startup.ignore_unavailable_nodes`
 - `coordination.role`
 - `coordination.peer_hosts`
@@ -139,10 +134,9 @@ Installer behavior:
 When adding nodes or services:
 1. Update inventory and node mapping in config.
-2. Keep the explicit service checklist focused on the core services that must come back during an outage.
-3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap.
-4. Add/adjust ingress expectations for exposed services.
-5. Use temporary ignores only when truly intentional, then remove them.
-6. Run `scripts/quality_gate.sh` before host deployment.
+2. Add/adjust service checklist entries for anything user-facing or critical.
+3. Add/adjust ingress expectations for exposed services.
+4. Use temporary ignores only when truly intentional, then remove them.
+5. Run `scripts/quality_gate.sh` before host deployment.
 Recovery quality should improve over time: every drill should reduce manual work in the next drill.
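The removed guidance above leans on the `*_required_*` startup scopes. One consumer is visible later in this diff (the startupRequiredNodes call in waitForNodeSSHAuth), but the helper's body is not. A plausible minimal reconstruction of the scoping semantics the docs describe is sketched here, offered only as an assumption: an empty scope means check everything, a non-empty scope narrows the check so optional stacks cannot block bootstrap.

// Hypothetical reconstruction; the real startupRequiredNodes body is not in this diff.
package cluster

import "strings"

func startupRequiredNodes(nodes, required []string) []string {
	if len(required) == 0 {
		return nodes // no explicit scope: keep the full inventory
	}
	want := map[string]struct{}{}
	for _, n := range required {
		if n = strings.TrimSpace(n); n != "" {
			want[n] = struct{}{}
		}
	}
	// Intersect the live inventory with the required set.
	out := make([]string, 0, len(nodes))
	for _, n := range nodes {
		if _, ok := want[strings.TrimSpace(n)]; ok {
			out = append(out, n)
		}
	}
	return out
}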


@@ -51,7 +51,6 @@ startup:
   require_node_inventory_reachability: true
   node_inventory_reachability_wait_seconds: 300
   node_inventory_reachability_poll_seconds: 5
-  node_inventory_reachability_required_nodes: []
   required_node_labels:
     titan-09:
       ananke.bstein.dev/harbor-bootstrap: "true"
@@ -91,7 +90,6 @@ startup:
     admin_secret_name: keycloak-admin
     admin_secret_username_key: username
     admin_secret_password_key: password
-  service_checklist_explicit_only: false
   service_checklist:
     - name: gitea-api
       url: https://scm.bstein.dev/api/healthz
@@ -136,26 +134,18 @@ startup:
   require_node_ssh_auth: true
   node_ssh_auth_wait_seconds: 240
   node_ssh_auth_poll_seconds: 5
-  node_ssh_auth_required_nodes: []
   require_flux_health: true
   flux_health_wait_seconds: 900
   flux_health_poll_seconds: 5
-  flux_health_required_kustomizations: []
   ignore_flux_kustomizations: []
   require_workload_convergence: true
   workload_convergence_wait_seconds: 900
   workload_convergence_poll_seconds: 5
-  workload_convergence_required_namespaces: []
   ignore_workload_namespaces: []
   ignore_workloads: []
   ignore_unavailable_nodes: []
   auto_recycle_stuck_pods: true
-  auto_quarantine_scheduling_storms: false
-  scheduling_storm_event_threshold: 30
-  scheduling_storm_window_seconds: 180
   stuck_pod_grace_seconds: 180
-  post_start_auto_heal_seconds: 60
-  dead_node_cleanup_grace_seconds: 300
   vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
   vault_unseal_breakglass_command: ""
   vault_unseal_breakglass_timeout_seconds: 15
@@ -180,7 +170,6 @@ ups:
   target: pyrphoros@localhost
   poll_seconds: 5
   runtime_safety_factor: 1.25
-  on_battery_grace_seconds: 90
   debounce_count: 3
   telemetry_timeout_seconds: 90
 coordination:


@@ -117,52 +117,8 @@ startup:
   require_node_inventory_reachability: true
   node_inventory_reachability_wait_seconds: 300
   node_inventory_reachability_poll_seconds: 5
-  node_inventory_reachability_required_nodes:
-    - titan-0a
-    - titan-0b
-    - titan-0c
   required_node_labels:
-    titan-04:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-05:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-06:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-07:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-08:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-11:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-12:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-13:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-14:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-15:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-17:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-18:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-19:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
     titan-09:
-      node-role.kubernetes.io/worker: "true"
       ananke.bstein.dev/harbor-bootstrap: "true"
   require_time_sync: true
   time_sync_wait_seconds: 240
@@ -200,7 +156,6 @@ startup:
     admin_secret_name: keycloak-admin
     admin_secret_username_key: username
     admin_secret_password_key: password
-  service_checklist_explicit_only: true
   service_checklist:
     - name: gitea-api
       url: https://scm.bstein.dev/api/healthz
@@ -245,49 +200,18 @@ startup:
   require_node_ssh_auth: true
   node_ssh_auth_wait_seconds: 240
   node_ssh_auth_poll_seconds: 5
-  node_ssh_auth_required_nodes:
-    - titan-0a
-    - titan-0b
-    - titan-0c
   require_flux_health: true
   flux_health_wait_seconds: 900
   flux_health_poll_seconds: 5
-  flux_health_required_kustomizations:
-    - flux-system/core
-    - flux-system/helm
-    - flux-system/traefik
-    - flux-system/cert-manager
-    - flux-system/longhorn
-    - flux-system/vault-csi
-    - flux-system/vault-injector
-    - flux-system/postgres
-    - flux-system/vault
-    - flux-system/keycloak
-    - flux-system/oauth2-proxy
-    - flux-system/gitea
-    - flux-system/monitoring
-    - flux-system/harbor
   ignore_flux_kustomizations: []
   require_workload_convergence: true
   workload_convergence_wait_seconds: 900
   workload_convergence_poll_seconds: 5
-  workload_convergence_required_namespaces:
-    - vault
-    - postgres
-    - sso
-    - gitea
-    - monitoring
-    - harbor
   ignore_workload_namespaces: []
   ignore_workloads: []
   ignore_unavailable_nodes: []
   auto_recycle_stuck_pods: true
-  auto_quarantine_scheduling_storms: true
-  scheduling_storm_event_threshold: 30
-  scheduling_storm_window_seconds: 180
   stuck_pod_grace_seconds: 180
-  post_start_auto_heal_seconds: 60
-  dead_node_cleanup_grace_seconds: 300
   vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
   vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
   vault_unseal_breakglass_timeout_seconds: 15
@@ -311,7 +235,6 @@ ups:
   target: statera@localhost
   poll_seconds: 5
   runtime_safety_factor: 1.25
-  on_battery_grace_seconds: 90
   debounce_count: 3
   telemetry_timeout_seconds: 90
 coordination:


@@ -117,52 +117,8 @@ startup:
   require_node_inventory_reachability: true
   node_inventory_reachability_wait_seconds: 300
   node_inventory_reachability_poll_seconds: 5
-  node_inventory_reachability_required_nodes:
-    - titan-0a
-    - titan-0b
-    - titan-0c
   required_node_labels:
-    titan-04:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-05:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-06:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-07:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-08:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-11:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-12:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-13:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-14:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-15:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-17:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-18:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
-    titan-19:
-      node-role.kubernetes.io/worker: "true"
-      longhorn-host: "true"
     titan-09:
-      node-role.kubernetes.io/worker: "true"
       ananke.bstein.dev/harbor-bootstrap: "true"
   require_time_sync: true
   time_sync_wait_seconds: 240
@@ -200,7 +156,6 @@ startup:
     admin_secret_name: keycloak-admin
     admin_secret_username_key: username
     admin_secret_password_key: password
-  service_checklist_explicit_only: true
   service_checklist:
     - name: gitea-api
       url: https://scm.bstein.dev/api/healthz
@@ -245,49 +200,18 @@ startup:
   require_node_ssh_auth: true
   node_ssh_auth_wait_seconds: 240
   node_ssh_auth_poll_seconds: 5
-  node_ssh_auth_required_nodes:
-    - titan-0a
-    - titan-0b
-    - titan-0c
   require_flux_health: true
   flux_health_wait_seconds: 900
   flux_health_poll_seconds: 5
-  flux_health_required_kustomizations:
-    - flux-system/core
-    - flux-system/helm
-    - flux-system/traefik
-    - flux-system/cert-manager
-    - flux-system/longhorn
-    - flux-system/vault-csi
-    - flux-system/vault-injector
-    - flux-system/postgres
-    - flux-system/vault
-    - flux-system/keycloak
-    - flux-system/oauth2-proxy
-    - flux-system/gitea
-    - flux-system/monitoring
-    - flux-system/harbor
   ignore_flux_kustomizations: []
   require_workload_convergence: true
   workload_convergence_wait_seconds: 900
   workload_convergence_poll_seconds: 5
-  workload_convergence_required_namespaces:
-    - vault
-    - postgres
-    - sso
-    - gitea
-    - monitoring
-    - harbor
   ignore_workload_namespaces: []
   ignore_workloads: []
   ignore_unavailable_nodes: []
   auto_recycle_stuck_pods: true
-  auto_quarantine_scheduling_storms: true
-  scheduling_storm_event_threshold: 30
-  scheduling_storm_window_seconds: 180
   stuck_pod_grace_seconds: 180
-  post_start_auto_heal_seconds: 60
-  dead_node_cleanup_grace_seconds: 300
   vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
   vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
   vault_unseal_breakglass_timeout_seconds: 15
@@ -311,7 +235,6 @@ ups:
   target: pyrphoros@localhost
   poll_seconds: 5
   runtime_safety_factor: 1.25
-  on_battery_grace_seconds: 90
   debounce_count: 3
   telemetry_timeout_seconds: 90
 coordination:


@@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e
 	ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
 	seen := map[string]struct{}{}
 	targets := make([]string, 0, len(nodes))
-	for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) {
+	for _, node := range nodes {
 		node = strings.TrimSpace(node)
 		if node == "" {
 			continue


@@ -1,288 +0,0 @@
package cluster
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
)
type nodeReadyList struct {
Items []struct {
Metadata struct {
Name string `json:"name"`
} `json:"metadata"`
Status struct {
Conditions []struct {
Type string `json:"type"`
Status string `json:"status"`
} `json:"conditions"`
} `json:"status"`
} `json:"items"`
}
type podDeleteList struct {
Items []struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
DeletionTimestamp *time.Time `json:"deletionTimestamp"`
} `json:"metadata"`
Spec struct {
NodeName string `json:"nodeName"`
} `json:"spec"`
} `json:"items"`
}
// RunPostStartAutoHeal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
// post-start drift without rerunning the full startup flow.
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
return o.postStartAutoHeal(ctx)
}
// postStartAutoHeal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
// Why: centralizes bounded post-start repair actions so recurring outage
// patterns only trigger the specific remediation they need.
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
errs := []string{}
requestReconcile := false
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
errs = append(errs, fmt.Sprintf("required node labels: %v", err))
}
vaultRecovered, err := o.autoRecoverSealedVault(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
} else if vaultRecovered {
requestReconcile = true
if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
}
}
cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
} else if cleaned > 0 {
requestReconcile = true
}
if requestReconcile {
o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
return o.requestFluxReconcile(ctx)
})
}
if len(errs) > 0 {
return errors.New(strings.Join(errs, "; "))
}
return nil
}
// autoRecoverSealedVault runs one orchestration or CLI step.
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
// Why: lets the daemon repair a later Vault reseal without waiting for a new
// bootstrap run.
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
if o.runner.DryRun {
return false, nil
}
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
if err != nil {
if isNotFoundErr(err) {
return false, nil
}
return false, fmt.Errorf("vault pod phase check failed: %w", err)
}
if strings.TrimSpace(phase) != "Running" {
return false, nil
}
sealed, err := o.vaultSealed(ctx)
if err != nil {
return false, err
}
if !sealed {
return false, nil
}
o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
if err := o.ensureVaultUnsealed(ctx); err != nil {
return false, err
}
return true, nil
}
// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
// downstream secret consumers stop carrying stale failures from the sealed window.
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "vault",
"create", "job",
"--from=cronjob/vault-k8s-auth-config",
jobName,
); err != nil {
return fmt.Errorf("create job %s: %w", jobName, err)
}
o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
return nil
}
// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
// clear only that narrow failure class instead of leaving garbage behind forever.
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
if o.runner.DryRun {
return 0, nil
}
unavailable, err := o.unavailableNodeSet(ctx)
if err != nil {
return 0, err
}
if len(unavailable) == 0 {
return 0, nil
}
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
return 0, fmt.Errorf("query pods: %w", err)
}
var pods podDeleteList
if err := json.Unmarshal([]byte(out), &pods); err != nil {
return 0, fmt.Errorf("decode pods: %w", err)
}
grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
now := time.Now()
count := 0
for _, item := range pods.Items {
if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
continue
}
if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
continue
}
if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
continue
}
o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
if _, err := o.kubectl(
ctx,
20*time.Second,
"-n", item.Metadata.Namespace,
"delete", "pod", item.Metadata.Name,
"--grace-period=0",
"--force",
"--wait=false",
); err != nil && !isNotFoundErr(err) {
return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
}
count++
}
if count > 0 {
o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
}
return count, nil
}
// unavailableNodeSet runs one orchestration or CLI step.
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
if err != nil {
return nil, fmt.Errorf("query nodes: %w", err)
}
var nodes nodeReadyList
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
return nil, fmt.Errorf("decode nodes: %w", err)
}
unavailable := map[string]struct{}{}
for _, item := range nodes.Items {
ready := ""
for _, cond := range item.Status.Conditions {
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
ready = strings.TrimSpace(cond.Status)
break
}
}
if ready != "True" {
unavailable[item.Metadata.Name] = struct{}{}
}
}
return unavailable, nil
}
// requestFluxReconcile runs one orchestration or CLI step.
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
// Why: post-start repairs need a lightweight way to refresh GitOps health
// without reusing the broader startup flux-resume flow.
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
now := time.Now().UTC().Format(time.RFC3339)
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate", "gitrepository", "flux-system",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux source reconcile: %w", err)
}
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate",
"kustomizations.kustomize.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
}
if _, err := o.kubectl(
ctx,
25*time.Second,
"annotate",
"--all-namespaces",
"helmreleases.helm.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
}
if o.runOverride == nil && o.runner.CommandExists("flux") {
if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
}
}
return nil
}
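The deleted file above exports RunPostStartAutoHeal as the daemon's narrow repair entrypoint, and the deleted configs carried a post_start_auto_heal_seconds interval. The daemon loop itself is not in this diff; the following is a hypothetical wiring sketch (assumed, not the repo's code) of how such an interval could drive the entrypoint.

// Hypothetical daemon wiring; only the RunPostStartAutoHeal surface comes from the diff.
package daemon

import (
	"context"
	"log"
	"time"
)

// autoHealer matches the narrow surface the deleted file exported.
type autoHealer interface {
	RunPostStartAutoHeal(ctx context.Context) error
}

// runAutoHealLoop invokes the repair entrypoint on a fixed interval until the
// context is cancelled; errors are logged rather than fatal, mirroring the
// best-effort character of the repairs themselves.
func runAutoHealLoop(ctx context.Context, o autoHealer, every time.Duration) {
	t := time.NewTicker(every)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			if err := o.RunPostStartAutoHeal(ctx); err != nil {
				log.Printf("post-start auto-heal: %v", err)
			}
		}
	}
}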


@@ -1,296 +0,0 @@
package cluster
import (
"context"
"errors"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestCleanupTerminatingPodsOnUnavailableNodesBranches runs one orchestration or CLI step.
// Signature: TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T).
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
// truly stranded pods and tolerates already-gone objects.
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if err != nil || count != 0 {
t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
}
})
t.Run("selective cleanup tolerates not found", func(t *testing.T) {
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{"items":[` +
`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
},
{
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
err: errors.New("pod old-stale not found"),
},
})
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if err != nil {
t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
}
if count != 1 {
t.Fatalf("expected one cleaned pod, got %d", count)
}
})
t.Run("query and decode errors surface", func(t *testing.T) {
queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
err: errors.New("nodes failed"),
},
})
if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
t.Fatalf("expected node query error, got %v", err)
}
decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{bad json`,
},
})
if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
t.Fatalf("expected pod decode error, got %v", err)
}
})
t.Run("delete hard error surfaces", func(t *testing.T) {
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
},
{
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
err: errors.New("delete failed"),
},
})
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
}
})
}
// TestUnavailableNodeSetBranches runs one orchestration or CLI step.
// Signature: TestUnavailableNodeSetBranches(t *testing.T).
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
// Ready condition payloads need direct coverage too.
func TestUnavailableNodeSetBranches(t *testing.T) {
t.Run("decode error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
})
if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
t.Fatalf("expected decode error, got %v", err)
}
})
t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
},
})
nodes, err := orch.unavailableNodeSet(context.Background())
if err != nil {
t.Fatalf("unavailableNodeSet failed: %v", err)
}
if _, ok := nodes["titan-22"]; !ok {
t.Fatalf("expected titan-22 to be treated as unavailable")
}
if _, ok := nodes["titan-07"]; ok {
t.Fatalf("did not expect titan-07 to be treated as unavailable")
}
})
}
// TestRequestFluxReconcileBranches runs one orchestration or CLI step.
// Signature: TestRequestFluxReconcileBranches(t *testing.T).
// Why: the post-start repair loop needs predictable Flux refresh behavior even
// when one annotation call is flaky.
func TestRequestFluxReconcileBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
}
})
t.Run("git source annotate error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
err: errors.New("annotate failed"),
},
})
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
t.Fatalf("expected gitrepository annotate error, got %v", err)
}
})
t.Run("kustomization annotate error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
out: "",
},
{
match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
err: errors.New("annotate failed"),
},
})
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
t.Fatalf("expected kustomization annotate error, got %v", err)
}
})
t.Run("helm annotate warning and flux command path", func(t *testing.T) {
tmpDir := t.TempDir()
callLog := filepath.Join(tmpDir, "calls.log")
kubectlPath := filepath.Join(tmpDir, "kubectl")
fluxPath := filepath.Join(tmpDir, "flux")
kubectlScript := "#!/bin/sh\n" +
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"case \"$*\" in\n" +
" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
"esac\n" +
"exit 0\n"
fluxScript := "#!/bin/sh\n" +
"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 0\n"
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
t.Fatalf("write fake kubectl: %v", err)
}
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
t.Fatalf("write fake flux: %v", err)
}
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
cfg := config.Config{
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(cfg.State.RunHistoryPath),
log: log.New(io.Discard, "", 0),
}
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
}
calls, err := os.ReadFile(callLog)
if err != nil {
t.Fatalf("read fake command log: %v", err)
}
logText := string(calls)
if !strings.Contains(logText, "annotate gitrepository flux-system") {
t.Fatalf("expected gitrepository annotate call, got %q", logText)
}
if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
t.Fatalf("expected kustomization annotate call, got %q", logText)
}
if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
t.Fatalf("expected flux reconcile command, got %q", logText)
}
})
t.Run("flux command failure is tolerated", func(t *testing.T) {
tmpDir := t.TempDir()
callLog := filepath.Join(tmpDir, "calls.log")
kubectlPath := filepath.Join(tmpDir, "kubectl")
fluxPath := filepath.Join(tmpDir, "flux")
kubectlScript := "#!/bin/sh\n" +
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 0\n"
fluxScript := "#!/bin/sh\n" +
"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 1\n"
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
t.Fatalf("write fake kubectl: %v", err)
}
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
t.Fatalf("write fake flux: %v", err)
}
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
cfg := config.Config{
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(cfg.State.RunHistoryPath),
log: log.New(io.Discard, "", 0),
}
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
}
calls, err := os.ReadFile(callLog)
if err != nil {
t.Fatalf("read fake command log: %v", err)
}
if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
}
})
}


@@ -1,382 +0,0 @@
package cluster
import (
"context"
"encoding/base64"
"errors"
"io"
"log"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestPostStartAutoHealRepairsVaultAndUnavailableNodes runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
// Why: covers the new daemon-triggered repair path for late Vault reseals and
// stale terminating pods anchored to unavailable nodes.
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
RequiredNodeLabels: map[string]map[string]string{
"titan-07": {"node-role.kubernetes.io/worker": "true"},
},
},
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
log: log.New(io.Discard, "", 0),
}
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
unsealCalls := 0
jobCreated := false
reconciled := false
deleted := map[string]bool{}
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name != "kubectl" {
return "", nil
}
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
return "", nil
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
if unsealCalls == 0 {
return `{"initialized":true,"sealed":true}`, nil
}
return `{"initialized":true,"sealed":false}`, nil
case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
case strings.Contains(joined, "vault operator unseal"):
unsealCalls++
return "", nil
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
jobCreated = true
return "", nil
case strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case strings.Contains(joined, "get pods -A -o json"):
return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
deleted["maintenance/stale-pod"] = true
return "", nil
case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
reconciled = true
return "", nil
case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
return "", nil
case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
if err := orch.postStartAutoHeal(context.Background()); err != nil {
t.Fatalf("postStartAutoHeal failed: %v", err)
}
if unsealCalls != 1 {
t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
}
if !jobCreated {
t.Fatalf("expected vault k8s auth config job to be created")
}
if !deleted["maintenance/stale-pod"] {
t.Fatalf("expected stale unavailable-node pod to be deleted")
}
if !reconciled {
t.Fatalf("expected flux reconcile request after repairs")
}
if deleted["logging/healthy-node-pod"] {
t.Fatalf("did not expect terminating pod on healthy node to be deleted")
}
}
// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
// Why: proves the new post-start repair loop stays quiet when the specific
// failure patterns are absent.
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
},
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
log: log.New(io.Discard, "", 0),
}
unsealCalls := 0
jobCreated := false
reconciled := false
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name != "kubectl" {
return "", nil
}
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
return `{"initialized":true,"sealed":false}`, nil
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
jobCreated = true
return "", nil
case strings.Contains(joined, "vault operator unseal"):
unsealCalls++
return "", nil
case strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case strings.Contains(joined, "get pods -A -o json"):
return `{"items":[]}`, nil
case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
reconciled = true
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
if err := orch.postStartAutoHeal(context.Background()); err != nil {
t.Fatalf("postStartAutoHeal failed: %v", err)
}
if unsealCalls != 0 {
t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
}
if jobCreated {
t.Fatalf("did not expect vault auth config job creation")
}
if reconciled {
t.Fatalf("did not expect flux reconcile request for healthy cluster")
}
}
// TestRunPostStartAutoHealDryRun runs one orchestration or CLI step.
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
// auto-heal never mutates cluster state during rehearsal runs.
func TestRunPostStartAutoHealDryRun(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
}
}
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
// Why: proves the daemon reports each failed sub-repair together instead of
// hiding later failures behind the first problem.
func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
RequiredNodeLabels: map[string]map[string]string{
"titan-07": {"node-role.kubernetes.io/worker": "true"},
},
},
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{
match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
err: errors.New("label failed"),
},
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault phase failed"),
},
{
match: matchContains("kubectl", "get nodes -o json"),
err: errors.New("node query failed"),
},
})
err := orch.postStartAutoHeal(context.Background())
if err == nil {
t.Fatalf("expected aggregated error")
}
msg := err.Error()
for _, want := range []string{
"required node labels:",
"vault auto-recovery:",
"dead-node terminating pod cleanup:",
} {
if !strings.Contains(msg, want) {
t.Fatalf("expected %q in %q", want, msg)
}
}
}
// TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step.
// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
// Why: late Vault reseals are a high-risk failure path, so the daemon needs
// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("pod missing is quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault-0 not found"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("phase check error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("phase check failed"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("non-running pod defers", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Pending",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("status parse failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: "garbage",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("already unsealed stays quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":false}`,
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("unseal failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":true}`,
},
{
match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
out: base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
},
{
match: matchContains("kubectl", "vault operator unseal"),
err: errors.New("exec boom"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
}
})
}
// TestRerunVaultK8sAuthConfigJobBranches runs one orchestration or CLI step.
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
// Why: the post-unseal auth job is part of the production recovery chain, so
// dry-run and create-error behavior both need explicit coverage.
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
}
})
t.Run("create error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
err: errors.New("create failed"),
},
})
err := orch.rerunVaultK8sAuthConfigJob(context.Background())
if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
t.Fatalf("expected create-job error, got %v", err)
}
})
}
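
For readers outside this repo: these tests lean on a small command-stub harness. The real commandStub, matchContains, and buildOrchestratorWithStubs live elsewhere in the package; the sketch below is a hypothetical reconstruction inferred only from how the tests above use them, not the package's actual definitions.

// Hypothetical reconstruction (requires "strings").
// A commandStub intercepts one external command and fakes its result.
type commandStub struct {
	match func(name string, args []string) bool // selects the invocation to intercept
	out   string                                // stdout returned to the caller
	err   error                                 // error returned instead of running anything
}

// matchContains (assumed shape) matches a binary by name plus a substring of its
// space-joined arguments, which is how the tests above target specific kubectl calls.
func matchContains(name, substr string) func(string, []string) bool {
	return func(gotName string, args []string) bool {
		return gotName == name && strings.Contains(strings.Join(args, " "), substr)
	}
}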

View File

@ -227,31 +227,6 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name) return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
} }
// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error).
// Why: lets startup defer vault unseal until the pod is actually runnable, while
// keeping the direct unseal helper strict for explicit recovery paths and tests.
func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) {
if o.runner.DryRun {
return false, "", nil
}
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
if err != nil {
if isNotFoundErr(err) {
return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil
}
return false, "", fmt.Errorf("vault pod phase check failed: %w", err)
}
trimmedPhase := strings.TrimSpace(phase)
if trimmedPhase != "Running" {
return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil
}
return false, "", o.ensureVaultUnsealed(ctx)
}
// ensureVaultUnsealed runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@ -143,8 +143,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
return false, "", fmt.Errorf("decode flux kustomizations: %w", err) return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
} }
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations) ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
required := o.startupRequiredFluxKustomizations()
requiredSeen := map[string]struct{}{}
notReady := []string{} notReady := []string{}
for _, ks := range list.Items { for _, ks := range list.Items {
ns := strings.TrimSpace(ks.Metadata.Namespace) ns := strings.TrimSpace(ks.Metadata.Namespace)
@ -156,12 +154,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
if ks.Spec.Suspend { if ks.Spec.Suspend {
continue continue
} }
if len(required) > 0 {
if _, ok := required[full]; !ok {
continue
}
requiredSeen[full] = struct{}{}
}
if _, ok := ignored[full]; ok { if _, ok := ignored[full]; ok {
continue continue
} }
@ -181,25 +173,10 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
} }
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason)) notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
} }
if len(required) > 0 {
missing := []string{}
for full := range required {
if _, ok := requiredSeen[full]; !ok {
missing = append(missing, full+"(missing)")
}
}
if len(missing) > 0 {
sort.Strings(missing)
notReady = append(notReady, missing...)
}
}
if len(notReady) > 0 { if len(notReady) > 0 {
sort.Strings(notReady) sort.Strings(notReady)
return false, "not ready: " + joinLimited(notReady, 6), nil return false, "not ready: " + joinLimited(notReady, 6), nil
} }
if len(required) > 0 {
return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil
}
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
} }

View File

@ -19,7 +19,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
return nil
}
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
for node := range o.cfg.Startup.RequiredNodeLabels {
node = strings.TrimSpace(node)
@ -29,10 +28,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
}
sort.Strings(nodes)
for _, node := range nodes {
if _, skip := ignored[node]; skip {
o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
continue
}
labels := o.cfg.Startup.RequiredNodeLabels[node]
if len(labels) == 0 {
continue
@ -60,11 +55,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
continue
}
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
continue
}
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
}
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))

View File

@ -37,7 +37,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
return invErr
}
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
o.maybeRunEarlyVaultUnseal(ctx)
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
@ -180,9 +179,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
}
}
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
if err := o.runStartupVaultUnsealGate(ctx); err != nil {
return err
}
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
return err
}
@ -480,3 +476,18 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
o.log.Printf("shutdown flow complete")
return nil
}
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
switch strings.TrimSpace(raw) {
case "", "config", "cluster-only":
return "cluster-only", nil
case "poweroff":
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
default:
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
}
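
As a quick in-package sketch of the mapping this helper enforces:

// Sketch: how callers see the normalized modes.
for _, raw := range []string{"", "config", "cluster-only", "poweroff", "reboot"} {
	mode, err := normalizeShutdownMode(raw)
	fmt.Printf("%q -> mode=%q err=%v\n", raw, mode, err)
}
// "", "config", and "cluster-only" all normalize to "cluster-only";
// "poweroff" gets the removal error, anything else the unsupported error.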

View File

@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
seen := map[string]struct{}{}
for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) {
for _, node := range o.inventoryNodesForValidation() {
node = strings.TrimSpace(node)
if node == "" {
continue
View File

@ -1,261 +0,0 @@
package cluster
import (
"context"
"encoding/json"
"fmt"
"sort"
"strings"
"time"
)
// maybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
// Signature: (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: a non-core workload that cannot schedule can emit enough warning events to
// thrash the control plane datastore; quarantine keeps startup moving while
// preserving core services.
func (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
if o.runner.DryRun || !o.cfg.Startup.AutoQuarantineSchedulingStorms {
return
}
now := time.Now()
if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
return
}
if lastAttempt != nil {
*lastAttempt = now
}
o.bestEffort("quarantine non-core scheduling storm workloads", func() error {
return o.quarantineSchedulingStormWorkloads(ctx)
})
}
// quarantineSchedulingStormWorkloads runs one orchestration or CLI step.
// Signature: (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: limits startup-only mitigation to workloads proven to be generating a
// scheduling event storm, instead of scaling optional apps down blindly.
func (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error {
podsOut, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query pods for scheduling storm scan: %w", err)
}
var pods podList
if err := json.Unmarshal([]byte(podsOut), &pods); err != nil {
return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
}
rsOut, err := o.kubectl(ctx, 30*time.Second, "get", "replicasets", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query replicasets for scheduling storm scan: %w", err)
}
var rsList replicaSetList
if err := json.Unmarshal([]byte(rsOut), &rsList); err != nil {
return fmt.Errorf("decode replicasets for scheduling storm scan: %w", err)
}
eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query events for scheduling storm scan: %w", err)
}
var events eventList
if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
return fmt.Errorf("decode events for scheduling storm scan: %w", err)
}
workloadsOut, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query workloads for scheduling storm scan: %w", err)
}
var workloads workloadList
if err := json.Unmarshal([]byte(workloadsOut), &workloads); err != nil {
return fmt.Errorf("decode workloads for scheduling storm scan: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
eventThreshold := o.cfg.Startup.SchedulingStormEventThreshold
if eventThreshold <= 0 {
eventThreshold = 30
}
window := time.Duration(o.cfg.Startup.SchedulingStormWindowSeconds) * time.Second
if window <= 0 {
window = 3 * time.Minute
}
podsByKey := map[string]podResource{}
for _, pod := range pods.Items {
ns := strings.TrimSpace(pod.Metadata.Namespace)
name := strings.TrimSpace(pod.Metadata.Name)
if ns == "" || name == "" {
continue
}
podsByKey[ns+"/"+name] = pod
}
rsOwners := map[string]ownerReference{}
for _, rs := range rsList.Items {
ns := strings.TrimSpace(rs.Metadata.Namespace)
name := strings.TrimSpace(rs.Metadata.Name)
if ns == "" || name == "" {
continue
}
for _, owner := range rs.Metadata.OwnerReferences {
kind := strings.TrimSpace(owner.Kind)
ownerName := strings.TrimSpace(owner.Name)
if kind == "" || ownerName == "" {
continue
}
rsOwners[ns+"/"+name] = owner
break
}
}
workloadDesired := map[string]int32{}
for _, item := range workloads.Items {
kind := strings.ToLower(strings.TrimSpace(item.Kind))
ns := strings.TrimSpace(item.Metadata.Namespace)
name := strings.TrimSpace(item.Metadata.Name)
if kind == "" || ns == "" || name == "" {
continue
}
desired, _, ok := desiredReady(item)
if !ok {
continue
}
workloadDesired[ns+"/"+kind+"/"+name] = desired
}
quarantined := []string{}
seen := map[string]struct{}{}
now := time.Now()
for _, event := range events.Items {
if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
continue
}
if strings.TrimSpace(event.Reason) != "FailedScheduling" {
continue
}
if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
continue
}
lastSeen := eventLastObservedAt(event)
if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
continue
}
count := eventObservationCount(event)
if count < eventThreshold {
continue
}
podKey := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
pod, ok := podsByKey[podKey]
if !ok {
continue
}
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
continue
}
ns := strings.TrimSpace(pod.Metadata.Namespace)
if _, ok := requiredNamespaces[ns]; ok {
continue
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}
if workloadIgnored(ignoreRules, ns, "", strings.TrimSpace(pod.Metadata.Name)) {
continue
}
if podTargetsIgnoredNode(pod, ignoredNodes) {
continue
}
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
if !ok {
continue
}
if workloadIgnored(ignoreRules, workload.Namespace, workload.Kind, workload.Name) {
continue
}
workloadKey := workload.Namespace + "/" + workload.Kind + "/" + workload.Name
if _, done := seen[workloadKey]; done {
continue
}
desired := workloadDesired[workloadKey]
if desired <= 0 {
continue
}
if err := o.ensureWorkloadReplicas(ctx, workload, 0); err != nil {
return fmt.Errorf("scale scheduling storm workload %s to 0: %w", workloadKey, err)
}
seen[workloadKey] = struct{}{}
quarantined = append(quarantined, fmt.Sprintf("%s events=%d window=%s", workloadKey, count, window))
}
if len(quarantined) == 0 {
return nil
}
sort.Strings(quarantined)
detail := fmt.Sprintf("quarantined scheduling storm workload(s): %s", joinLimited(quarantined, 8))
o.log.Printf("%s", detail)
o.noteStartupAutoHeal(detail)
return nil
}
// schedulingStormOwnerWorkload runs one orchestration or CLI step.
// Signature: schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool).
// Why: scheduling storms happen at the pod layer, but safe mitigation needs to
// operate on the owning deployment or statefulset.
func schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool) {
ns := strings.TrimSpace(pod.Metadata.Namespace)
for _, owner := range pod.Metadata.OwnerReferences {
switch strings.TrimSpace(owner.Kind) {
case "StatefulSet":
if name := strings.TrimSpace(owner.Name); name != "" {
return startupWorkload{Namespace: ns, Kind: "statefulset", Name: name}, true
}
case "ReplicaSet":
rsName := strings.TrimSpace(owner.Name)
if rsName == "" {
continue
}
rsOwner, ok := rsOwners[ns+"/"+rsName]
if !ok || strings.TrimSpace(rsOwner.Kind) != "Deployment" || strings.TrimSpace(rsOwner.Name) == "" {
continue
}
return startupWorkload{Namespace: ns, Kind: "deployment", Name: strings.TrimSpace(rsOwner.Name)}, true
}
}
return startupWorkload{}, false
}
// eventObservationCount runs one orchestration or CLI step.
// Signature: eventObservationCount(event eventResource) int.
// Why: event count can live either on the root event or in the series payload;
// using the max keeps detection stable across Kubernetes versions.
func eventObservationCount(event eventResource) int {
count := event.Count
if event.Series.Count > count {
count = event.Series.Count
}
if count < 1 {
return 1
}
return count
}
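
Worked through (an in-package sketch):

// Sketch of the max-with-floor rule.
e := eventResource{Count: 4}
e.Series.Count = 41
_ = eventObservationCount(e)                       // 41: the series count wins when larger
_ = eventObservationCount(eventResource{Count: 4}) // 4: the root count stands on its own
_ = eventObservationCount(eventResource{})         // 1: floor for events carrying no counts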
// eventLastObservedAt runs one orchestration or CLI step.
// Signature: eventLastObservedAt(event eventResource) time.Time.
// Why: event recency fields vary by cluster version; prefer the newest explicit
// observation time and fall back to creation time when needed.
func eventLastObservedAt(event eventResource) time.Time {
switch {
case !event.Series.LastObservedTime.IsZero():
return event.Series.LastObservedTime
case !event.LastTimestamp.IsZero():
return event.LastTimestamp
case !event.EventTime.IsZero():
return event.EventTime
default:
return event.Metadata.CreationTimestamp
}
}
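
Wiring the feature up end to end, a minimal sketch using the Startup fields this file reads (declared in the config package later in this diff; the values shown are the fallbacks applied above, not tuning advice):

// Sketch: the three knobs the scheduling-storm path consumes.
cfg := config.Config{}
cfg.Startup.AutoQuarantineSchedulingStorms = true // off by default; nothing runs without it
cfg.Startup.SchedulingStormEventThreshold = 30    // FailedScheduling observations per pod
cfg.Startup.SchedulingStormWindowSeconds = 180    // only events this recent are considered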

View File

@ -1,21 +0,0 @@
package cluster
import (
"fmt"
"strings"
)
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
switch strings.TrimSpace(raw) {
case "", "config", "cluster-only":
return "cluster-only", nil
case "poweroff":
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
default:
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
}

View File

@ -1,81 +0,0 @@
package cluster
import "strings"
// startupRequiredNodes runs one orchestration or CLI step.
// Signature: startupRequiredNodes(nodes []string, required []string) []string.
// Why: lets startup enforce a smaller core node set during outage recovery
// without losing the stricter all-nodes behavior when no override is configured.
func startupRequiredNodes(nodes []string, required []string) []string {
requiredSet := makeStringSet(required)
if len(requiredSet) == 0 {
return nodes
}
filtered := make([]string, 0, len(nodes))
for _, node := range nodes {
node = strings.TrimSpace(node)
if node == "" {
continue
}
if _, ok := requiredSet[node]; ok {
filtered = append(filtered, node)
}
}
return filtered
}
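
Behavior in miniature, via the exported test hook that appears later in this diff:

all := []string{"titan-0a", "titan-06", "titan-13"}
_ = TestHookStartupRequiredNodes(all, nil)                  // no override: all three pass through
_ = TestHookStartupRequiredNodes(all, []string{"titan-0a"}) // core-only recovery: just titan-0a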
// startupNodeStrictlyRequired runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool.
// Why: absent or broken non-core nodes should not block recovery-only actions
// like label reconciliation once the operator has narrowed startup to core nodes.
func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool {
node = strings.TrimSpace(node)
if node == "" {
return false
}
if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 {
return true
}
for _, controlPlane := range o.cfg.ControlPlanes {
if strings.TrimSpace(controlPlane) == node {
return true
}
}
if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) {
return true
}
return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node)
}
// startupRequiredFluxKustomizations runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}.
// Why: lets outage recovery wait on a declared core GitOps slice while leaving
// optional stacks free to converge after bootstrap succeeds.
func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} {
return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations)
}
// startupRequiredWorkloadNamespaces runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: keeps workload readiness scoped to core namespaces during recovery while
// preserving broad convergence checks when no explicit core list is configured.
func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} {
return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces)
}
// containsNode runs one orchestration or CLI step.
// Signature: containsNode(entries []string, needle string) bool.
// Why: keeps node-scope checks small and explicit anywhere startup narrows its
// recovery gates to a declared core set.
func containsNode(entries []string, needle string) bool {
needle = strings.TrimSpace(needle)
if needle == "" {
return false
}
for _, entry := range entries {
if strings.TrimSpace(entry) == needle {
return true
}
}
return false
}

View File

@ -1,52 +0,0 @@
package cluster
import (
"context"
"fmt"
"time"
)
// maybeRunEarlyVaultUnseal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context).
// Why: gives startup a best-effort Vault recovery path when the API is already
// live, without consuming the hard startup failure path before workloads recover.
func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) {
if err := o.waitForAPI(ctx, 1, time.Second); err != nil {
return
}
o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available")
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
if err != nil {
o.log.Printf("warning: early vault unseal deferred: %v", err)
o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err))
return
}
if deferred {
o.log.Printf("vault early unseal deferred: %s", detail)
o.noteStartupAutoHeal(detail)
return
}
o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed")
}
// runStartupVaultUnsealGate runs one orchestration or CLI step.
// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error.
// Why: keeps the top-level startup flow readable while allowing Vault unseal to
// defer cleanly until critical workload recovery when the pod is not runnable yet.
func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error {
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
if err != nil {
o.noteStartupCheck("vault-unseal", false, err.Error())
return err
}
if deferred {
o.log.Printf("vault unseal deferred until workload recovery: %s", detail)
o.noteStartupAutoHeal(detail)
o.noteStartupCheck("vault-unseal", true, detail)
return nil
}
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
return nil
}

View File

@ -177,46 +177,6 @@ type jobConditionRef struct {
Status string `json:"status"`
}
type eventList struct {
Items []eventResource `json:"items"`
}
type eventResource struct {
Metadata struct {
Namespace string `json:"namespace"`
CreationTimestamp time.Time `json:"creationTimestamp"`
} `json:"metadata"`
InvolvedObject struct {
Kind string `json:"kind"`
Namespace string `json:"namespace"`
Name string `json:"name"`
} `json:"involvedObject"`
Type string `json:"type"`
Reason string `json:"reason"`
Message string `json:"message"`
Count int `json:"count"`
EventTime time.Time `json:"eventTime"`
LastTimestamp time.Time `json:"lastTimestamp"`
Series eventSeries `json:"series"`
}
type eventSeries struct {
Count int `json:"count"`
LastObservedTime time.Time `json:"lastObservedTime"`
}
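
For orientation, a representative FailedScheduling event (illustrative, not captured output) that unmarshals into the structs above:

// Illustrative payload; field names follow the JSON tags above.
const sampleEvent = `{
  "metadata": {"namespace": "media", "creationTimestamp": "2026-01-05T10:00:00Z"},
  "involvedObject": {"kind": "Pod", "namespace": "media", "name": "app-6f7c9-x2x"},
  "type": "Warning",
  "reason": "FailedScheduling",
  "message": "0/12 nodes are available",
  "count": 3,
  "series": {"count": 41, "lastObservedTime": "2026-01-05T10:02:30Z"}
}`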
type replicaSetList struct {
Items []replicaSetResource `json:"items"`
}
type replicaSetResource struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
OwnerReferences []ownerReference `json:"ownerReferences"`
} `json:"metadata"`
}
type workloadResource struct {
Kind string `json:"kind"`
Metadata struct {
@ -261,7 +221,6 @@ type podResource struct {
type ownerReference struct {
Kind string `json:"kind"`
Name string `json:"name"`
}
type podContainerStatus struct {

View File

@ -26,12 +26,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error {
lastLogged := time.Time{}
lastRecycleAttempt := time.Time{}
lastReplicaHeal := time.Time{}
lastSchedulingStormHeal := time.Time{}
for {
prevFailure := lastFailure
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
o.maybeAutoQuarantineSchedulingStorms(ctx, &lastSchedulingStormHeal)
ready, detail, err := o.workloadConvergenceReady(ctx)
if err != nil {
lastFailure = err.Error()
@ -73,7 +71,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if err := json.Unmarshal([]byte(out), &list); err != nil {
return false, "", fmt.Errorf("decode controllers: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
@ -87,11 +84,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if kind == "" || ns == "" || name == "" {
continue
}
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}

View File

@ -116,7 +116,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
return nil, fmt.Errorf("decode pods: %w", err) return nil, fmt.Errorf("decode pods: %w", err)
} }
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
stuckReasons := map[string]struct{}{ stuckReasons := map[string]struct{}{
@ -139,11 +138,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
if ns == "" || name == "" { if ns == "" || name == "" {
continue continue
} }
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok { if _, ok := ignoredNamespaces[ns]; ok {
continue continue
} }

View File

@ -1,88 +0,0 @@
package cluster
import (
"context"
"fmt"
"strings"
"time"
)
// TestHookMaybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: exposes the scheduling-storm trigger guard to the split top-level test module.
func (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
o.maybeAutoQuarantineSchedulingStorms(ctx, lastAttempt)
}
// TestHookQuarantineSchedulingStormWorkloads runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: exposes the scheduling-storm auto-heal body to the split top-level test module.
func (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error {
return o.quarantineSchedulingStormWorkloads(ctx)
}
// TestHookSchedulingStormOwnerWorkload runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormOwnerWorkload(namespace string, ownerKind string, ownerName string, rsOwnerKind string, rsOwnerName string) (string, bool).
// Why: exposes owner-resolution behavior without leaking internal workload types.
func TestHookSchedulingStormOwnerWorkload(
namespace string,
ownerKind string,
ownerName string,
rsOwnerKind string,
rsOwnerName string,
) (string, bool) {
var pod podResource
pod.Metadata.Namespace = strings.TrimSpace(namespace)
pod.Metadata.OwnerReferences = []ownerReference{{
Kind: strings.TrimSpace(ownerKind),
Name: strings.TrimSpace(ownerName),
}}
rsOwners := map[string]ownerReference{}
if rsName := strings.TrimSpace(ownerName); rsName != "" && strings.TrimSpace(ownerKind) == "ReplicaSet" {
rsOwners[pod.Metadata.Namespace+"/"+rsName] = ownerReference{
Kind: strings.TrimSpace(rsOwnerKind),
Name: strings.TrimSpace(rsOwnerName),
}
}
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
if !ok {
return "", false
}
return fmt.Sprintf("%s/%s/%s", workload.Namespace, workload.Kind, workload.Name), true
}
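
For example, the two owner shapes the hook can resolve:

key, ok := TestHookSchedulingStormOwnerWorkload("media", "ReplicaSet", "app-6f7c9", "Deployment", "app")
// ok == true, key == "media/deployment/app": pod -> ReplicaSet -> Deployment
key2, ok2 := TestHookSchedulingStormOwnerWorkload("media", "StatefulSet", "db", "", "")
// ok2 == true, key2 == "media/statefulset/db": StatefulSets resolve directly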
// TestHookEventObservationCount runs one orchestration or CLI step.
// Signature: TestHookEventObservationCount(count int, seriesCount int) int.
// Why: exposes event-count normalization used by scheduling-storm detection.
func TestHookEventObservationCount(count int, seriesCount int) int {
return eventObservationCount(eventResource{
Count: count,
Series: eventSeries{
Count: seriesCount,
},
})
}
// TestHookEventLastObservedAt runs one orchestration or CLI step.
// Signature: TestHookEventLastObservedAt(seriesLastObserved time.Time, lastTimestamp time.Time, eventTime time.Time, creationTimestamp time.Time) time.Time.
// Why: exposes event-time fallback behavior used by scheduling-storm detection.
func TestHookEventLastObservedAt(
seriesLastObserved time.Time,
lastTimestamp time.Time,
eventTime time.Time,
creationTimestamp time.Time,
) time.Time {
return eventLastObservedAt(eventResource{
LastTimestamp: lastTimestamp,
EventTime: eventTime,
Series: eventSeries{
LastObservedTime: seriesLastObserved,
},
Metadata: struct {
Namespace string `json:"namespace"`
CreationTimestamp time.Time `json:"creationTimestamp"`
}{
CreationTimestamp: creationTimestamp,
},
})
}

View File

@ -1,55 +0,0 @@
package cluster
import "context"
// TestHookStartupRequiredNodes runs one orchestration or CLI step.
// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string.
// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing.
func TestHookStartupRequiredNodes(nodes []string, required []string) []string {
return startupRequiredNodes(nodes, required)
}
// TestHookContainsNode runs one orchestration or CLI step.
// Signature: TestHookContainsNode(entries []string, needle string) bool.
// Why: exposes the small startup-scope membership helper to top-level tests.
func TestHookContainsNode(entries []string, needle string) bool {
return containsNode(entries, needle)
}
// TestHookStartupNodeStrictlyRequired runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool.
// Why: exposes strict-node startup scoping so outage-recovery tests can confirm
// non-core nodes stop blocking bootstrap.
func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool {
return o.startupNodeStrictlyRequired(node)
}
// TestHookStartupRequiredFluxKustomizations runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}.
// Why: exposes flux startup scoping so top-level tests can confirm only core
// kustomizations block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} {
return o.startupRequiredFluxKustomizations()
}
// TestHookStartupRequiredWorkloadNamespaces runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: exposes workload namespace startup scoping so top-level tests can
// confirm only core workloads block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} {
return o.startupRequiredWorkloadNamespaces()
}
// TestHookMaybeRunEarlyVaultUnseal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context).
// Why: exposes the early startup Vault deferral helper to top-level tests.
func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) {
o.maybeRunEarlyVaultUnseal(ctx)
}
// TestHookRunStartupVaultUnsealGate runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error.
// Why: exposes the startup Vault gate helper to top-level tests.
func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error {
return o.runStartupVaultUnsealGate(ctx)
}

View File

@ -33,9 +33,6 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
c.Startup.NodeInventoryReachPollSeconds = 5
}
if c.Startup.NodeInventoryReachRequiredNodes == nil {
c.Startup.NodeInventoryReachRequiredNodes = []string{}
}
if c.Startup.RequiredNodeLabels == nil {
c.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
@ -124,11 +121,7 @@ func (c *Config) applyDefaults() {
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
}
if c.Startup.ServiceChecklistExplicitOnly {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{})
} else {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
}
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
for i := range c.Startup.ServiceChecklist {
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
@ -159,18 +152,12 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
c.Startup.NodeSSHAuthPollSeconds = 5
}
if c.Startup.NodeSSHAuthRequiredNodes == nil {
c.Startup.NodeSSHAuthRequiredNodes = []string{}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
c.Startup.FluxHealthWaitSeconds = 900
}
if c.Startup.FluxHealthPollSeconds <= 0 {
c.Startup.FluxHealthPollSeconds = 5
}
if c.Startup.FluxHealthRequiredKustomizations == nil {
c.Startup.FluxHealthRequiredKustomizations = []string{}
}
if c.Startup.IgnoreFluxKustomizations == nil {
c.Startup.IgnoreFluxKustomizations = []string{}
}
@ -180,9 +167,6 @@ func (c *Config) applyDefaults() {
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
c.Startup.WorkloadConvergencePollSeconds = 5
}
if c.Startup.WorkloadConvergenceRequiredNamespaces == nil {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{}
}
if c.Startup.IgnoreWorkloadNamespaces == nil {
c.Startup.IgnoreWorkloadNamespaces = []string{}
}
@ -195,12 +179,6 @@ func (c *Config) applyDefaults() {
if c.Startup.StuckPodGraceSeconds <= 0 {
c.Startup.StuckPodGraceSeconds = 180
}
if c.Startup.PostStartAutoHealSeconds <= 0 {
c.Startup.PostStartAutoHealSeconds = 60
}
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
c.Startup.DeadNodeCleanupGraceSeconds = 300
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
}
@ -243,12 +221,6 @@ func (c *Config) applyDefaults() {
if c.UPS.TelemetryTimeoutSeconds <= 0 {
c.UPS.TelemetryTimeoutSeconds = 90
}
if c.Startup.SchedulingStormEventThreshold <= 0 {
c.Startup.SchedulingStormEventThreshold = 30
}
if c.Startup.SchedulingStormWindowSeconds <= 0 {
c.Startup.SchedulingStormWindowSeconds = 180
}
if c.Coordination.ForwardShutdownConfig == "" {
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
}

View File

@ -39,25 +39,24 @@ func defaults() Config {
"maintenance", "maintenance",
}, },
Startup: Startup{ Startup: Startup{
APIWaitSeconds: 1200, APIWaitSeconds: 1200,
APIPollSeconds: 2, APIPollSeconds: 2,
ShutdownCooldownSeconds: 45, ShutdownCooldownSeconds: 45,
RequireNodeInventoryReach: true, RequireNodeInventoryReach: true,
NodeInventoryReachWaitSeconds: 300, NodeInventoryReachWaitSeconds: 300,
NodeInventoryReachPollSeconds: 5, NodeInventoryReachPollSeconds: 5,
NodeInventoryReachRequiredNodes: []string{}, RequireTimeSync: true,
RequireTimeSync: true, TimeSyncWaitSeconds: 240,
TimeSyncWaitSeconds: 240, TimeSyncPollSeconds: 5,
TimeSyncPollSeconds: 5, TimeSyncMode: "quorum",
TimeSyncMode: "quorum", TimeSyncQuorum: 2,
TimeSyncQuorum: 2, ReconcileAccessOnBoot: true,
ReconcileAccessOnBoot: true, AutoEtcdRestoreOnAPIFailure: true,
AutoEtcdRestoreOnAPIFailure: true, EtcdRestoreControlPlane: "titan-0a",
EtcdRestoreControlPlane: "titan-0a", RequireStorageReady: true,
RequireStorageReady: true, StorageReadyWaitSeconds: 420,
StorageReadyWaitSeconds: 420, StorageReadyPollSeconds: 5,
StorageReadyPollSeconds: 5, StorageMinReadyNodes: 2,
StorageMinReadyNodes: 2,
StorageCriticalPVCs: []string{ StorageCriticalPVCs: []string{
"vault/data-vault-0", "vault/data-vault-0",
"postgres/postgres-data-postgres-0", "postgres/postgres-data-postgres-0",
@ -92,36 +91,33 @@ func defaults() Config {
AdminSecretUsernameKey: "username",
AdminSecretPasswordKey: "password",
},
ServiceChecklist: defaultServiceChecklist(),
RequireCriticalServiceEndpoints: true,
CriticalServiceEndpointWaitSec: 420,
CriticalServiceEndpointPollSec: 5,
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
RequireIngressChecklist: true,
IngressChecklistWaitSeconds: 420,
IngressChecklistPollSeconds: 5,
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
IngressChecklistIgnoreHosts: []string{},
RequireNodeSSHAuth: true,
NodeSSHAuthWaitSeconds: 240,
NodeSSHAuthPollSeconds: 5,
NodeSSHAuthRequiredNodes: []string{},
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
FluxHealthRequiredKustomizations: []string{},
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
WorkloadConvergenceRequiredNamespaces: []string{},
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
},
Shutdown: Shutdown{
DefaultBudgetSeconds: 1380,

View File

@ -51,41 +51,3 @@ startup:
t.Fatalf("expected validation failure") t.Fatalf("expected validation failure")
} }
} }
// TestLoadKeepsExplicitServiceChecklist runs one orchestration or CLI step.
// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T).
// Why: host recovery configs must be able to keep a narrow, explicit checklist
// without silently inheriting the full default service catalog.
func TestLoadKeepsExplicitServiceChecklist(t *testing.T) {
cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
raw := `
control_planes: [titan-0a]
expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
iac_repo_path: /opt/titan-iac
startup:
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
ups:
enabled: false
`
if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil {
t.Fatalf("write config: %v", err)
}
cfg, err := Load(cfgPath)
if err != nil {
t.Fatalf("load config: %v", err)
}
if len(cfg.Startup.ServiceChecklist) != 1 {
t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist))
}
if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" {
t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name)
}
}

View File

@ -27,75 +27,65 @@ type Config struct {
}
type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
TimeSyncMode string `yaml:"time_sync_mode"`
TimeSyncQuorum int `yaml:"time_sync_quorum"`
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
RequireStorageReady bool `yaml:"require_storage_ready"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"`
RequireServiceChecklist bool `yaml:"require_service_checklist"`
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"`
RequireFluxHealth bool `yaml:"require_flux_health"`
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"`
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"`
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
IgnoreWorkloads []string `yaml:"ignore_workloads"`
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
AutoQuarantineSchedulingStorms bool `yaml:"auto_quarantine_scheduling_storms"`
SchedulingStormEventThreshold int `yaml:"scheduling_storm_event_threshold"`
SchedulingStormWindowSeconds int `yaml:"scheduling_storm_window_seconds"`
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
PostStartAutoHealSeconds int `yaml:"post_start_auto_heal_seconds"`
DeadNodeCleanupGraceSeconds int `yaml:"dead_node_cleanup_grace_seconds"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
type ServiceChecklistCheck struct {
@ -146,7 +136,6 @@ type UPS struct {
Targets []UPSTarget `yaml:"targets"`
PollSeconds int `yaml:"poll_seconds"`
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"`
DebounceCount int `yaml:"debounce_count"`
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
}

View File

@ -61,11 +61,6 @@ func (c Config) Validate() error {
if c.Startup.NodeInventoryReachPollSeconds <= 0 { if c.Startup.NodeInventoryReachPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0") return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
} }
for _, node := range c.Startup.NodeInventoryReachRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
}
}
for node, labels := range c.Startup.RequiredNodeLabels { for node, labels := range c.Startup.RequiredNodeLabels {
if strings.TrimSpace(node) == "" { if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.required_node_labels keys must not be empty") return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
@ -238,46 +233,21 @@ func (c Config) Validate() error {
if c.Startup.NodeSSHAuthPollSeconds <= 0 { if c.Startup.NodeSSHAuthPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0") return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
} }
for _, node := range c.Startup.NodeSSHAuthRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty")
}
}
if c.Startup.FluxHealthWaitSeconds <= 0 { if c.Startup.FluxHealthWaitSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0") return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
} }
if c.Startup.FluxHealthPollSeconds <= 0 { if c.Startup.FluxHealthPollSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0") return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
} }
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
item = strings.TrimSpace(item)
if item == "" {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty")
}
if strings.Count(item, "/") != 1 {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item)
}
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 { if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0") return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
} }
if c.Startup.WorkloadConvergencePollSeconds <= 0 { if c.Startup.WorkloadConvergencePollSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0") return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
} }
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if strings.TrimSpace(ns) == "" {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty")
}
}
if c.Startup.StuckPodGraceSeconds <= 0 { if c.Startup.StuckPodGraceSeconds <= 0 {
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0") return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
} }
if c.Startup.PostStartAutoHealSeconds <= 0 {
return fmt.Errorf("config.startup.post_start_auto_heal_seconds must be > 0")
}
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
return fmt.Errorf("config.startup.dead_node_cleanup_grace_seconds must be > 0")
}
for _, probe := range c.Startup.PostStartProbes { for _, probe := range c.Startup.PostStartProbes {
if strings.TrimSpace(probe) == "" { if strings.TrimSpace(probe) == "" {
return fmt.Errorf("config.startup.post_start_probes entries must not be empty") return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
@ -307,16 +277,6 @@ func (c Config) Validate() error {
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty") return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
} }
} }
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) {
return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item))
}
}
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns))
}
}
for _, node := range c.Startup.IgnoreUnavailableNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
@ -332,9 +292,6 @@ func (c Config) Validate() error {
if c.UPS.Provider == "" { if c.UPS.Provider == "" {
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled") return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
} }
if c.UPS.OnBatteryGraceSeconds < 0 {
return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0")
}
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 { if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled") return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
} }
@ -349,14 +306,6 @@ func (c Config) Validate() error {
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set") return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
} }
} }
if c.Startup.AutoQuarantineSchedulingStorms {
if c.Startup.SchedulingStormEventThreshold <= 0 {
return fmt.Errorf("config.startup.scheduling_storm_event_threshold must be > 0 when auto_quarantine_scheduling_storms is enabled")
}
if c.Startup.SchedulingStormWindowSeconds <= 0 {
return fmt.Errorf("config.startup.scheduling_storm_window_seconds must be > 0 when auto_quarantine_scheduling_storms is enabled")
}
}
for _, peer := range c.Coordination.PeerHosts {
if strings.TrimSpace(peer) == "" {
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
@ -379,20 +328,3 @@ func (c Config) Validate() error {
}
return nil
}
// containsTrimmed reports whether entries contains needle after whitespace trimming.
// Signature: containsTrimmed(entries []string, needle string) bool.
// Why: startup config now supports both required and ignored recovery scopes, so
// validation needs a single normalized overlap check for those lists.
func containsTrimmed(entries []string, needle string) bool {
needle = strings.TrimSpace(needle)
if needle == "" {
return false
}
for _, entry := range entries {
if strings.TrimSpace(entry) == needle {
return true
}
}
return false
}
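
To make the removed guard concrete, here is a minimal, self-contained sketch of how containsTrimmed backs the required/ignored overlap checks above; the scope values are hypothetical, not taken from any real cluster config.

package main

import (
	"fmt"
	"strings"
)

// containsTrimmed mirrors the helper above: membership after trimming whitespace.
func containsTrimmed(entries []string, needle string) bool {
	needle = strings.TrimSpace(needle)
	if needle == "" {
		return false
	}
	for _, entry := range entries {
		if strings.TrimSpace(entry) == needle {
			return true
		}
	}
	return false
}

func main() {
	// Hypothetical scopes: one kustomization is both required and ignored.
	required := []string{"flux-system/core", "monitoring/stack"}
	ignored := []string{" flux-system/core "} // sloppy whitespace still matches
	for _, item := range required {
		if containsTrimmed(ignored, item) {
			fmt.Printf("overlap: %q is both required and ignored\n", strings.TrimSpace(item))
		}
	}
}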

View File

@ -30,7 +30,6 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }}, {"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }}, {"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }}, {"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }}, {"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }}, {"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }}, {"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
@ -69,42 +68,19 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }}, {"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }}, {"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }}, {"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
{"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }},
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }}, {"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }}, {"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
{"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }},
{"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }},
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }}, {"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }}, {"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }}, {"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
{"bad_post_start_auto_heal_seconds", func(c *Config) { c.Startup.PostStartAutoHealSeconds = 0 }},
{"bad_dead_node_cleanup_grace_seconds", func(c *Config) { c.Startup.DeadNodeCleanupGraceSeconds = 0 }},
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }}, {"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }}, {"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }}, {"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }}, {"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
{"bad_overlap_flux_required_and_ignored", func(c *Config) {
c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"}
c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"}
}},
{"bad_overlap_workload_required_and_ignored", func(c *Config) {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"}
}},
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }}, {"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }}, {"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
{"bad_scheduling_storm_threshold", func(c *Config) {
c.Startup.AutoQuarantineSchedulingStorms = true
c.Startup.SchedulingStormEventThreshold = 0
}},
{"bad_scheduling_storm_window", func(c *Config) {
c.Startup.AutoQuarantineSchedulingStorms = true
c.Startup.SchedulingStormWindowSeconds = 0
}},
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }}, {"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }}, {"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
{"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }},
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }}, {"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
{"bad_ups_targets_item_empty", func(c *Config) { {"bad_ups_targets_item_empty", func(c *Config) {
c.UPS.Enabled = true c.UPS.Enabled = true
@ -145,13 +121,6 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup defaults to be set")
}
if cfg.Startup.PostStartAutoHealSeconds <= 0 || cfg.Startup.DeadNodeCleanupGraceSeconds <= 0 {
t.Fatalf("expected post-start auto-heal defaults to be set")
}
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
t.Fatalf("expected startup recovery scope slices to be initialized")
}
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
t.Fatalf("expected critical service endpoint timing defaults to be set")
}

View File

@ -32,8 +32,6 @@ type Daemon struct {
targets []Target
log *log.Logger
exporter *metrics.Exporter
postStartAutoHealOverride func(context.Context) error
}
var sshConfigCandidates = []string{
@ -94,9 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error {
lastGood := map[string]time.Time{}
lastOnBattery := map[string]bool{}
onBatterySince := map[string]time.Time{}
breachCount := map[string]int{}
lastAutoHeal := time.Time{}
for _, t := range d.targets {
lastGood[t.Name] = time.Now()
}
@ -111,16 +107,12 @@ func (d *Daemon) Run(ctx context.Context) error {
case <-t.C:
budget := d.orch.EstimatedEmergencyShutdownSeconds()
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
anyOnBattery := false
d.exporter.UpdateBudget(budget)
for _, target := range d.targets {
sample, err := target.Provider.Read(ctx)
if err != nil {
if lastOnBattery[target.Name] {
anyOnBattery = true
}
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err) d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
d.exporter.UpdateSample(metrics.Sample{ d.exporter.UpdateSample(metrics.Sample{
Name: target.Name, Name: target.Name,
@ -139,45 +131,17 @@ func (d *Daemon) Run(ctx context.Context) error {
}
lastGood[target.Name] = time.Now()
if sample.OnBattery {
anyOnBattery = true
}
wasOnBattery := lastOnBattery[target.Name]
if sample.OnBattery {
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
onBatterySince[target.Name] = time.Now()
}
} else {
onBatterySince[target.Name] = time.Time{}
}
lastOnBattery[target.Name] = sample.OnBattery
onBatteryElapsed := 0
if sample.OnBattery && !onBatterySince[target.Name].IsZero() {
onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds())
}
trigger := false
triggerReason := ""
switch {
case sample.LowBattery:
trigger = true
triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus)
case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold:
trigger = true
triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds:
trigger = true
triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus)
}
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
if trigger {
breachCount[target.Name]++
} else {
breachCount[target.Name] = 0
}
d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d", d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name]) target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
d.exporter.UpdateSample(metrics.Sample{
Name: target.Name,
@ -196,54 +160,14 @@ func (d *Daemon) Run(ctx context.Context) error {
})
if breachCount[target.Name] >= debounce {
return d.triggerShutdown(ctx, triggerReason)
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
return d.triggerShutdown(ctx, reason)
}
}
d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
}
}
}
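
For intuition, a runnable sketch of the trigger arithmetic the loop applies each tick: threshold = ceil(budget * runtime_safety_factor), and the retained code path fires on low battery or when reported runtime falls inside that threshold. The numbers below are hypothetical.

package main

import (
	"fmt"
	"math"
)

// sample mirrors the UPS fields the loop consults; values are made up.
type sample struct {
	OnBattery      bool
	LowBattery     bool
	RuntimeSeconds int
}

func main() {
	budget := 1380       // hypothetical estimated emergency-shutdown seconds
	safetyFactor := 1.25 // hypothetical ups.runtime_safety_factor
	threshold := int(math.Ceil(float64(budget) * safetyFactor))

	s := sample{OnBattery: true, RuntimeSeconds: 1500}
	trigger := s.LowBattery || (s.OnBattery && s.RuntimeSeconds > 0 && s.RuntimeSeconds <= threshold)
	fmt.Printf("threshold=%ds runtime=%ds trigger=%t\n", threshold, s.RuntimeSeconds, trigger)
	// With these numbers: threshold=1725s, runtime=1500s, so trigger=true.
}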
// maybeRunPostStartAutoHeal gates the periodic post-start repair pass.
// Signature: (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool).
// Why: gives the long-running daemon a bounded path to repair post-start drift
// like a later Vault reseal or stale dead-node deletions without waiting for a
// fresh bootstrap run.
func (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool) {
interval := time.Duration(d.cfg.Startup.PostStartAutoHealSeconds) * time.Second
if interval <= 0 || anyOnBattery {
return
}
if d.orch == nil && d.postStartAutoHealOverride == nil {
return
}
now := time.Now()
if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
return
}
if lastRun != nil {
*lastRun = now
}
if err := d.runPostStartAutoHeal(ctx); err != nil {
d.log.Printf("warning: post-start auto-heal: %v", err)
}
}
// runPostStartAutoHeal executes a single post-start repair pass.
// Signature: (d *Daemon) runPostStartAutoHeal(ctx context.Context) error.
// Why: keeps the daemon loop readable while allowing unit tests to inject a
// deterministic repair hook without a live cluster.
func (d *Daemon) runPostStartAutoHeal(ctx context.Context) error {
if d.postStartAutoHealOverride != nil {
return d.postStartAutoHealOverride(ctx)
}
if d.orch == nil {
return nil
}
return d.orch.RunPostStartAutoHeal(ctx)
}
// triggerShutdown initiates the coordinated shutdown sequence for the given reason.
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@ -165,50 +165,6 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
}
}
// TestDaemonRunTriggersShutdownAfterOnBatteryGrace exercises the sustained-on-battery shutdown path.
// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T).
// Why: covers the sustained-on-battery trigger so short runtime estimates are not
// the only path to a graceful shutdown during abrupt power loss.
func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) {
stateDir := t.TempDir()
orch := newDaemonTestOrchestrator(t, stateDir)
d := &Daemon{
cfg: config.Config{
UPS: config.UPS{
Enabled: true,
PollSeconds: 1,
DebounceCount: 1,
RuntimeSafetyFactor: 1.0,
OnBatteryGraceSeconds: 1,
},
State: config.State{
IntentPath: filepath.Join(stateDir, "intent.json"),
},
Shutdown: config.Shutdown{
EmergencySkipDrain: true,
EmergencySkipEtcd: true,
},
},
orch: orch,
targets: []Target{
{
Name: "Pyrphoros",
Target: "pyrphoros@localhost",
Provider: &daemonFakeProvider{
samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
},
},
},
log: log.New(io.Discard, "", 0),
exporter: metrics.New(),
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := d.Run(ctx); err != nil {
t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err)
}
}
// TestForwardShutdownSucceedsWithSSHShim verifies forward shutdown succeeds through the SSH shim.
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
// Why: covers forward-shutdown SSH execution path.

View File

@ -1,51 +0,0 @@
package service
import (
"context"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
)
// TestDaemonMaybeRunPostStartAutoHeal verifies the auto-heal scheduling guards.
// Signature: TestDaemonMaybeRunPostStartAutoHeal(t *testing.T).
// Why: covers the daemon-side interval and on-battery guards for the new
// post-start repair loop.
func TestDaemonMaybeRunPostStartAutoHeal(t *testing.T) {
calls := 0
d := &Daemon{
cfg: config.Config{
Startup: config.Startup{
PostStartAutoHealSeconds: 10,
},
},
postStartAutoHealOverride: func(context.Context) error {
calls++
return nil
},
}
var last time.Time
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 1 {
t.Fatalf("expected first auto-heal invocation, got %d", calls)
}
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 1 {
t.Fatalf("expected interval guard to suppress second call, got %d", calls)
}
last = time.Now().Add(-11 * time.Second)
d.maybeRunPostStartAutoHeal(context.Background(), &last, true)
if calls != 1 {
t.Fatalf("expected on-battery guard to suppress call, got %d", calls)
}
last = time.Now().Add(-11 * time.Second)
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 2 {
t.Fatalf("expected second allowed auto-heal call, got %d", calls)
}
}

View File

@ -22,23 +22,12 @@ type Intent struct {
UpdatedAt time.Time `json:"updated_at"`
}
var (
readIntentImpl = readIntentDefault
writeIntentImpl = writeIntentDefault
)
var writeIntentImpl = writeIntentDefault
// ReadIntent loads the persisted intent state from path.
// Signature: ReadIntent(path string) (Intent, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func ReadIntent(path string) (Intent, error) {
return readIntentImpl(path)
}
// readIntentDefault is the production implementation behind ReadIntent.
// Signature: readIntentDefault(path string) (Intent, error).
// Why: keeps production read behavior available while tests can override intent
// reads deterministically without racing background file mutations.
func readIntentDefault(path string) (Intent, error) {
b, err := os.ReadFile(path)
if err != nil {
if os.IsNotExist(err) {

View File

@ -22,34 +22,6 @@ func TestHookWriteIntentDefault(path string, in Intent) error {
return writeIntentDefault(path, in)
}
// TestHookReadIntentDefault exposes the production intent reader to top-level tests.
// Signature: TestHookReadIntentDefault(path string) (Intent, error).
// Why: lets top-level tests delegate to production ReadIntent behavior while
// selectively forcing deterministic read sequences for lifecycle branches.
func TestHookReadIntentDefault(path string) (Intent, error) {
return readIntentDefault(path)
}
// TestHookSetReadIntentOverride swaps the package intent reader and returns a restore func.
// Signature: TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()).
// Why: enables deterministic intent-read failure injection without sleeping
// goroutines that race slower CI agents.
func TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()) {
testHookOverrideMu.Lock()
prev := readIntentImpl
if fn == nil {
readIntentImpl = readIntentDefault
} else {
readIntentImpl = fn
}
testHookOverrideMu.Unlock()
return func() {
testHookOverrideMu.Lock()
readIntentImpl = prev
testHookOverrideMu.Unlock()
}
}
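
A short usage sketch of the override/restore pattern these hooks support; the test name, error text, and path are hypothetical, and it assumes the standard testing and fmt imports within the same package.

// Hypothetical test exercising the removed read-intent override hook.
func TestReadIntentFailureIsSurfaced(t *testing.T) {
	restore := TestHookSetReadIntentOverride(func(path string) (Intent, error) {
		return Intent{}, fmt.Errorf("injected read failure for %s", path)
	})
	defer restore() // always put the production reader back

	if _, err := ReadIntent("/var/lib/ananke/intent.json"); err == nil {
		t.Fatalf("expected injected read failure to propagate")
	}
}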
// TestHookSetWriteIntentOverride swaps the package intent writer and returns a restore func.
// Signature: TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()).
// Why: enables deterministic intent-write failure injection from the top-level

View File

@ -1,116 +0,0 @@
# Binary, config template, and systemd artifact helpers for the installer.
resolve_build_target() {
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
echo "./cmd/ananke"
return 0
fi
return 1
}
install_config_template() {
local template="$1"
local dest="$2"
local src legacy
local -a modern_candidates=()
local -a legacy_candidates=()
case "${template}" in
coordinator)
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
legacy_candidates=("configs/hecate.titan-db.yaml")
;;
peer)
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
legacy_candidates=("configs/hecate.tethys.yaml")
;;
example)
modern_candidates=("configs/ananke.example.yaml")
legacy_candidates=("configs/hecate.example.yaml")
;;
*)
echo "[install] unknown config template key: ${template}" >&2
return 1
;;
esac
for src in "${modern_candidates[@]}"; do
if [[ -f "${src}" ]]; then
install -m 0640 "${src}" "${dest}"
return 0
fi
done
for legacy in "${legacy_candidates[@]}"; do
if [[ -f "${legacy}" ]]; then
src="$(mktemp)"
legacy_path_rewrite "${legacy}" "${src}"
install -m 0640 "${src}" "${dest}"
rm -f "${src}"
return 0
fi
done
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
return 1
}
install_systemd_units() {
local tmp
while IFS='|' read -r target_name modern_name legacy_name; do
local modern_src="deploy/systemd/${modern_name}"
local legacy_src="deploy/systemd/${legacy_name}"
local target="${SYSTEMD_DIR}/${target_name}"
if [[ -f "${modern_src}" ]]; then
install -m 0644 "${modern_src}" "${target}"
continue
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
install -m 0644 "${tmp}" "${target}"
rm -f "${tmp}"
continue
fi
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
return 1
done <<'EOF_UNITS'
ananke.service|ananke.service|hecate.service
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
ananke-update.service|ananke-update.service|hecate-update.service
ananke-update.timer|ananke-update.timer|hecate-update.timer
EOF_UNITS
}
install_self_update_script() {
local modern_src="scripts/ananke-self-update.sh"
local legacy_src="scripts/hecate-self-update.sh"
local target="${LIB_DIR}/ananke-self-update.sh"
local tmp
if [[ -f "${modern_src}" ]]; then
install -m 0755 "${modern_src}" "${target}"
return 0
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
sed -Ei \
-e 's/HECATE_/ANANKE_/g' \
-e 's/hecate-self-update/ananke-self-update/g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
"${tmp}"
install -m 0755 "${tmp}" "${target}"
rm -f "${tmp}"
return 0
fi
echo "[install] missing both modern and legacy self-update scripts." >&2
return 1
}

View File

@ -1,334 +0,0 @@
# Config migration helpers for the Ananke host installer.
read_ananke_role() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
echo "coordinator"
return 0
fi
local role
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -z "${role}" ]]; then
role="coordinator"
fi
echo "${role}"
}
migration_yaml_lookup() {
local key="$1"
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
first_control_plane_name() {
awk '
/^control_planes:[[:space:]]*$/ {in_list=1; next}
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
in_list && /^[^[:space:]]/ {in_list=0}
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
lookup_node_host() {
local node="$1"
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
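
These helpers are line-oriented scans over a flat key: value layout, not real YAML parsing. A rough Go equivalent of migration_yaml_lookup, shown only to make the first-match rule explicit; the path in main is illustrative.

package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// migrationYAMLLookup mimics the awk helper: the first "key: value" hit wins,
// with no awareness of nesting, quoting, or comments.
func migrationYAMLLookup(path, key string) string {
	f, err := os.Open(path)
	if err != nil {
		return ""
	}
	defer f.Close()
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		k, v, ok := strings.Cut(sc.Text(), ":")
		if ok && k == key {
			return strings.TrimSpace(v)
		}
	}
	return ""
}

func main() {
	fmt.Println(migrationYAMLLookup("/etc/ananke/ananke.yaml", "ssh_user"))
}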
migrate_ananke_config() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
return 0
fi
local changed=0
local role_hint
role_hint="$(read_ananke_role)"
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
changed=1
fi
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
install_cluster_inventory_defaults "${role_hint}" && changed=1
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
fi
}
install_cluster_inventory_defaults() {
local role="$1"
local changed=0
local inventory_block=""
local managed_block=""
local workers_block
workers_block='workers:
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
if [[ "${role}" == "coordinator" || "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
if [[ -n "${inventory_block}" ]] && grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
changed=1
fi
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
changed=1
fi
if [[ -n "${managed_block}" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
install_peer_inventory_defaults && changed=1
fi
[[ "${changed}" -eq 1 ]]
}
install_peer_inventory_defaults() {
local changed=0
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
[[ "${changed}" -eq 1 ]]
}
sanitize_migrated_ananke_config() {
local cfg="${CONF_DIR}/ananke.yaml"
[[ -f "${cfg}" ]] || return 0
local tmp changed=0
tmp="$(mktemp)"
# If a legacy migration bug appended root-level node entries after
# ssh_managed_nodes, drop those orphan entries until the next top-level key.
awk '
BEGIN {in_managed=0}
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
{
if (in_managed) {
if ($0 ~ /^ - /) {print; next}
if ($0 ~ /^- /) {next}
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
}
print
}
' "${cfg}" > "${tmp}"
if ! cmp -s "${cfg}" "${tmp}"; then
mv "${tmp}" "${cfg}"
changed=1
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
else
rm -f "${tmp}"
fi
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
changed=1
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${cfg}" || true
fi
}
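
The awk pass above is a small per-line state machine. A hedged Go rendering of the same filter, assuming the installer's two-space list indentation; the sample input in main is made up.

package main

import (
	"fmt"
	"strings"
)

// dropOrphanManagedNodeEntries mirrors the awk sanitizer: inside the
// ssh_managed_nodes block, keep "  - " items, drop root-level "- " orphans,
// and close the block at the next top-level key.
func dropOrphanManagedNodeEntries(lines []string) []string {
	var out []string
	inManaged := false
	for _, line := range lines {
		switch {
		case strings.TrimRight(line, " \t") == "ssh_managed_nodes:":
			inManaged = true
		case inManaged && strings.HasPrefix(line, "  - "):
			// well-formed list item: keep it
		case inManaged && strings.HasPrefix(line, "- "):
			continue // orphan root-level entry: drop it
		case inManaged && !strings.HasPrefix(line, " "):
			inManaged = false // next top-level key ends the block
		}
		out = append(out, line)
	}
	return out
}

func main() {
	in := []string{"ssh_managed_nodes:", "  - titan-db", "- stray-entry", "workers: []"}
	fmt.Println(strings.Join(dropOrphanManagedNodeEntries(in), "\n"))
}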

View File

@ -1,239 +0,0 @@
# Host bootstrap helpers for the Ananke installer.
resolve_nut_ups_name() {
if [[ -n "${NUT_UPS_NAME}" ]]; then
return 0
fi
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
local target=""
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
if [[ -n "${target}" ]]; then
NUT_UPS_NAME="${target%@localhost}"
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
return 0
fi
fi
NUT_UPS_NAME="pyrphoros"
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
ensure_ananke_kubeconfig() {
local kubeconfig_path
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
if [[ -z "${kubeconfig_path}" ]]; then
kubeconfig_path="/etc/ananke/kubeconfig"
fi
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
fi
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
cp_name="$(first_control_plane_name)"
if [[ -z "${cp_name}" ]]; then
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
return 0
fi
cp_host="$(lookup_node_host "${cp_name}")"
if [[ -z "${cp_host}" ]]; then
cp_host="${cp_name}"
fi
ssh_user="$(migration_yaml_lookup "ssh_user")"
ssh_port="$(migration_yaml_lookup "ssh_port")"
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${ssh_port}" ]]; then
ssh_port="2277"
fi
local target
target="${cp_host}"
if [[ -n "${ssh_user}" ]]; then
target="${ssh_user}@${cp_host}"
fi
local ssh_args=(
-o BatchMode=yes
-o ConnectTimeout=8
-o StrictHostKeyChecking=accept-new
)
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
ssh_args+=(-F "${ssh_cfg}")
fi
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
ssh_args+=(-i "${ssh_key}")
fi
if [[ -n "${ssh_port}" ]]; then
ssh_args+=(-p "${ssh_port}")
fi
local remote_cfg
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
chmod 0600 "${kubeconfig_path}"
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
else
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
fi
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
ensure_ananke_ssh_identity() {
local key_path key_dir key_user key_comment
key_path="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${key_path}" ]]; then
key_path="/home/atlas/.ssh/id_ed25519"
fi
key_dir="$(dirname "${key_path}")"
key_comment="ananke-$(hostname)-forward"
key_user="root"
if [[ "${key_path}" == /home/*/* ]]; then
key_user="${key_path#/home/}"
key_user="${key_user%%/*}"
fi
if ! id "${key_user}" >/dev/null 2>&1; then
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
return 0
fi
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
if [[ ! -s "${key_path}" ]]; then
echo "[install] generating missing SSH identity at ${key_path}"
if [[ "${key_user}" == "root" ]]; then
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
else
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
fi
fi
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
chmod 0600 "${key_path}" || true
chmod 0644 "${key_path}.pub" || true
}
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
missing+=("${pkg}")
fi
done
if [[ ${#missing[@]} -eq 0 ]]; then
return 0
fi
echo "[install] apt install: ${missing[*]}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y "${missing[@]}"
}
install_kubectl_if_missing() {
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
ensure_apt_packages kubernetes-client || true
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
echo "[install] installing kubectl via upstream binary"
local arch
arch="$(uname -m)"
case "${arch}" in
x86_64) arch="amd64" ;;
aarch64|arm64) arch="arm64" ;;
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
esac
local version
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
chmod 0755 /usr/local/bin/kubectl
}
ensure_dependencies() {
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
echo "[install] skipping dependency installation"
return 0
fi
if ! command -v apt-get >/dev/null 2>&1; then
echo "This installer currently supports apt-based hosts only." >&2
exit 1
fi
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
install_kubectl_if_missing
}
configure_nut() {
if [[ "${MANAGE_NUT}" != "1" ]]; then
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
return 0
fi
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
install -d -m 0755 /etc/nut /etc/udev/rules.d
cat > /etc/nut/nut.conf <<EOF
MODE=standalone
EOF
cat > /etc/nut/ups.conf <<EOF
[${NUT_UPS_NAME}]
driver = usbhid-ups
port = auto
vendorid = ${NUT_VENDOR_ID}
productid = ${NUT_PRODUCT_ID}
pollinterval = 5
EOF
cat > /etc/nut/upsd.users <<EOF
[${NUT_MONITOR_USER}]
password = ${NUT_MONITOR_PASSWORD}
upsmon primary
EOF
chmod 0640 /etc/nut/upsd.users
if getent group nut >/dev/null 2>&1; then
chown root:nut /etc/nut/upsd.users
else
chown root:root /etc/nut/upsd.users
fi
cat > /etc/nut/upsmon.conf <<EOF
RUN_AS_USER nut
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
MINSUPPLIES 1
SHUTDOWNCMD "/sbin/shutdown -h +0"
POLLFREQ 5
POLLFREQALERT 5
HOSTSYNC 15
DEADTIME 15
POWERDOWNFLAG /etc/killpower
EOF
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
EOF
udevadm control --reload-rules || true
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
}

View File

@ -1,98 +0,0 @@
# Legacy Hecate migration helpers for the Ananke installer.
legacy_path_rewrite() {
local src="$1"
local dst="$2"
sed \
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#/etc/hecate#/etc/ananke#g' \
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
-e 's/hecate.yaml/ananke.yaml/g' \
-e 's/hecate.lock/ananke.lock/g' \
-e 's/hecate/ananke/g' \
-e 's/Hecate/Ananke/g' \
-e 's#hecate\.lock#ananke.lock#g' \
"${src}" > "${dst}"
}
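
legacy_path_rewrite is essentially an ordered substitution table. A minimal Go approximation using strings.NewReplacer, with specific paths listed before the generic hecate->ananke renames; the sed original applies its rules sequentially, so this single-pass version is only an approximation, and the sample unit line is hypothetical.

package main

import (
	"fmt"
	"strings"
)

// legacyRewriter approximates the sed rules: specific paths first,
// then the generic hecate->ananke renames.
var legacyRewriter = strings.NewReplacer(
	"/etc/hecate/hecate.yaml", "/etc/ananke/ananke.yaml",
	"/var/lib/hecate/hecate.lock", "/var/lib/ananke/ananke.lock",
	"/opt/hecate", "/opt/ananke",
	"hecate.yaml", "ananke.yaml",
	"hecate", "ananke",
	"Hecate", "Ananke",
)

func main() {
	fmt.Println(legacyRewriter.Replace("ExecStart=/usr/local/bin/hecate --config /etc/hecate/hecate.yaml"))
	// -> ExecStart=/usr/local/bin/ananke --config /etc/ananke/ananke.yaml
}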
migrate_legacy_hecate_install() {
local legacy_conf_dir="/etc/hecate"
local legacy_state_dir="/var/lib/hecate"
local legacy_systemd_dir="/etc/systemd/system"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
chmod 0640 "${CONF_DIR}/ananke.yaml"
fi
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
fi
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
fi
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
fi
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
fi
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
fi
if [[ -d "${legacy_systemd_dir}" ]]; then
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
fi
fi
}
retire_legacy_hecate_install() {
local ts backup_dir
ts="$(date +%Y%m%d%H%M%S)"
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
systemctl stop hecate-update.service >/dev/null 2>&1 || true
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
install -d -m 0750 "${backup_dir}"
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
echo "[install] backed up legacy hecate assets to ${backup_dir}"
fi
rm -f \
/etc/systemd/system/hecate.service \
/etc/systemd/system/hecate-bootstrap.service \
/etc/systemd/system/hecate-update.service \
/etc/systemd/system/hecate-update.timer
rm -f /usr/local/bin/hecate
rm -rf /usr/local/lib/hecate
rm -rf /opt/hecate
rm -rf /etc/hecate
rm -rf /var/lib/hecate
}

View File

@ -41,10 +41,829 @@ while [[ $# -gt 0 ]]; do
esac
done
source "${REPO_DIR}/scripts/install-config-migration.sh" resolve_nut_ups_name() {
source "${REPO_DIR}/scripts/install-host-bootstrap.sh" if [[ -n "${NUT_UPS_NAME}" ]]; then
source "${REPO_DIR}/scripts/install-legacy-migration.sh" return 0
source "${REPO_DIR}/scripts/install-artifacts.sh" fi
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
local target=""
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
if [[ -n "${target}" ]]; then
NUT_UPS_NAME="${target%@localhost}"
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
return 0
fi
fi
NUT_UPS_NAME="pyrphoros"
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
read_ananke_role() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
echo "coordinator"
return 0
fi
local role
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -z "${role}" ]]; then
role="coordinator"
fi
echo "${role}"
}
migration_yaml_lookup() {
local key="$1"
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
first_control_plane_name() {
awk '
/^control_planes:[[:space:]]*$/ {in_list=1; next}
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
in_list && /^[^[:space:]]/ {in_list=0}
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
lookup_node_host() {
local node="$1"
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
ensure_ananke_kubeconfig() {
local kubeconfig_path
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
if [[ -z "${kubeconfig_path}" ]]; then
kubeconfig_path="/etc/ananke/kubeconfig"
fi
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
fi
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
cp_name="$(first_control_plane_name)"
if [[ -z "${cp_name}" ]]; then
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
return 0
fi
cp_host="$(lookup_node_host "${cp_name}")"
if [[ -z "${cp_host}" ]]; then
cp_host="${cp_name}"
fi
ssh_user="$(migration_yaml_lookup "ssh_user")"
ssh_port="$(migration_yaml_lookup "ssh_port")"
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${ssh_port}" ]]; then
ssh_port="2277"
fi
local target
target="${cp_host}"
if [[ -n "${ssh_user}" ]]; then
target="${ssh_user}@${cp_host}"
fi
local ssh_args=(
-o BatchMode=yes
-o ConnectTimeout=8
-o StrictHostKeyChecking=accept-new
)
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
ssh_args+=(-F "${ssh_cfg}")
fi
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
ssh_args+=(-i "${ssh_key}")
fi
if [[ -n "${ssh_port}" ]]; then
ssh_args+=(-p "${ssh_port}")
fi
local remote_cfg
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
chmod 0600 "${kubeconfig_path}"
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
else
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
fi
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
ensure_ananke_ssh_identity() {
local key_path key_dir key_user key_comment
key_path="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${key_path}" ]]; then
key_path="/home/atlas/.ssh/id_ed25519"
fi
key_dir="$(dirname "${key_path}")"
key_comment="ananke-$(hostname)-forward"
key_user="root"
if [[ "${key_path}" == /home/*/* ]]; then
key_user="${key_path#/home/}"
key_user="${key_user%%/*}"
fi
if ! id "${key_user}" >/dev/null 2>&1; then
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
return 0
fi
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
if [[ ! -s "${key_path}" ]]; then
echo "[install] generating missing SSH identity at ${key_path}"
if [[ "${key_user}" == "root" ]]; then
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
else
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
fi
fi
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
chmod 0600 "${key_path}" || true
chmod 0644 "${key_path}.pub" || true
}
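# Apply in-place schema migrations to an existing ananke.yaml: bump legacy
# defaults, add newly introduced keys, and hydrate empty inventory blocks.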
migrate_ananke_config() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
return 0
fi
local changed=0
local role_hint
role_hint="$(read_ananke_role)"
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
changed=1
fi
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
local role
role="$(read_ananke_role)"
local inventory_block
local managed_block
local workers_block
workers_block='workers:
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
if [[ "${role}" == "coordinator" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
elif [[ "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
if [[ -n "${inventory_block}" ]]; then
if grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
changed=1
fi
fi
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
changed=1
fi
if [[ -n "${managed_block}" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
fi
}
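# Repair config damage left behind by earlier migrations before the file is used.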
sanitize_migrated_ananke_config() {
local cfg="${CONF_DIR}/ananke.yaml"
[[ -f "${cfg}" ]] || return 0
local tmp changed=0
tmp="$(mktemp)"
# Legacy migration bug guard:
# If root-level "- node" entries were accidentally appended after ssh_managed_nodes,
# drop those orphan entries until the next top-level key.
awk '
BEGIN {in_managed=0}
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
{
if (in_managed) {
if ($0 ~ /^ - /) {print; next}
if ($0 ~ /^- /) {next}
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
}
print
}
' "${cfg}" > "${tmp}"
if ! cmp -s "${cfg}" "${tmp}"; then
mv "${tmp}" "${cfg}"
changed=1
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
else
rm -f "${tmp}"
fi
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
changed=1
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${cfg}" || true
fi
}
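# Install any of the given apt packages that are not already present.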
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
missing+=("${pkg}")
fi
done
if [[ ${#missing[@]} -eq 0 ]]; then
return 0
fi
echo "[install] apt install: ${missing[*]}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y "${missing[@]}"
}
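# Provide kubectl: prefer the distro package, otherwise download the latest
# stable upstream binary for this architecture.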
install_kubectl_if_missing() {
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
ensure_apt_packages kubernetes-client || true
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
echo "[install] installing kubectl via upstream binary"
local arch
arch="$(uname -m)"
case "${arch}" in
x86_64) arch="amd64" ;;
aarch64|arm64) arch="arm64" ;;
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
esac
local version
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
chmod 0755 /usr/local/bin/kubectl
}
ensure_dependencies() {
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
echo "[install] skipping dependency installation"
return 0
fi
if ! command -v apt-get >/dev/null 2>&1; then
echo "This installer currently supports apt-based hosts only." >&2
exit 1
fi
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
install_kubectl_if_missing
}
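# Rewrite legacy hecate paths and names to their ananke equivalents while
# writing the contents of src to dst.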
legacy_path_rewrite() {
local src="$1"
local dst="$2"
sed \
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#/etc/hecate#/etc/ananke#g' \
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
-e 's/hecate.yaml/ananke.yaml/g' \
-e 's/hecate.lock/ananke.lock/g' \
-e 's/hecate/ananke/g' \
-e 's/Hecate/Ananke/g' \
-e 's#hecate\.lock#ananke.lock#g' \
"${src}" > "${dst}"
}
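# Carry over config, kubeconfig, vault key, and state files from a legacy
# hecate installation so an upgrade keeps its history.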
migrate_legacy_hecate_install() {
local legacy_conf_dir="/etc/hecate"
local legacy_state_dir="/var/lib/hecate"
local legacy_systemd_dir="/etc/systemd/system"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
chmod 0640 "${CONF_DIR}/ananke.yaml"
fi
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
fi
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
fi
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
fi
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
fi
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
fi
if [[ -d "${legacy_systemd_dir}" ]]; then
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
fi
fi
}
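# Stop and remove legacy hecate units, binaries, and directories after backing
# them up under /var/backups.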
retire_legacy_hecate_install() {
local ts backup_dir
ts="$(date +%Y%m%d%H%M%S)"
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
systemctl stop hecate-update.service >/dev/null 2>&1 || true
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
install -d -m 0750 "${backup_dir}"
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
echo "[install] backed up legacy hecate assets to ${backup_dir}"
fi
rm -f \
/etc/systemd/system/hecate.service \
/etc/systemd/system/hecate-bootstrap.service \
/etc/systemd/system/hecate-update.service \
/etc/systemd/system/hecate-update.timer
rm -f /usr/local/bin/hecate
rm -rf /usr/local/lib/hecate
rm -rf /opt/hecate
rm -rf /etc/hecate
rm -rf /var/lib/hecate
}
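# Echo the Go package path to build when the ananke command directory exists.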
resolve_build_target() {
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
echo "./cmd/ananke"
return 0
fi
return 1
}
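# Install the role-specific config template, preferring modern ananke sources
# and rewriting legacy hecate templates when only those exist.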
install_config_template() {
local template="$1"
local dest="$2"
local src legacy
local -a modern_candidates=()
local -a legacy_candidates=()
case "${template}" in
coordinator)
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
legacy_candidates=("configs/hecate.titan-db.yaml")
;;
peer)
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
legacy_candidates=("configs/hecate.tethys.yaml")
;;
example)
modern_candidates=("configs/ananke.example.yaml")
legacy_candidates=("configs/hecate.example.yaml")
;;
*)
echo "[install] unknown config template key: ${template}" >&2
return 1
;;
esac
for src in "${modern_candidates[@]}"; do
if [[ -f "${src}" ]]; then
install -m 0640 "${src}" "${dest}"
return 0
fi
done
for legacy in "${legacy_candidates[@]}"; do
if [[ -f "${legacy}" ]]; then
src="$(mktemp)"
legacy_path_rewrite "${legacy}" "${src}"
install -m 0640 "${src}" "${dest}"
rm -f "${src}"
return 0
fi
done
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
return 1
}
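# Install systemd units from modern sources, falling back to rewritten legacy
# hecate unit files.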
install_systemd_units() {
  local tmp
while IFS='|' read -r target_name modern_name legacy_name; do
local modern_src="deploy/systemd/${modern_name}"
local legacy_src="deploy/systemd/${legacy_name}"
local target="${SYSTEMD_DIR}/${target_name}"
if [[ -f "${modern_src}" ]]; then
install -m 0644 "${modern_src}" "${target}"
continue
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
install -m 0644 "${tmp}" "${target}"
rm -f "${tmp}"
continue
fi
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
return 1
done <<'EOF_UNITS'
ananke.service|ananke.service|hecate.service
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
ananke-update.service|ananke-update.service|hecate-update.service
ananke-update.timer|ananke-update.timer|hecate-update.timer
EOF_UNITS
}
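# Install the self-update helper, rewriting the legacy hecate script when the
# modern one is absent.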
install_self_update_script() {
local modern_src="scripts/ananke-self-update.sh"
local legacy_src="scripts/hecate-self-update.sh"
local target="${LIB_DIR}/ananke-self-update.sh"
local tmp
if [[ -f "${modern_src}" ]]; then
install -m 0755 "${modern_src}" "${target}"
return 0
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
sed -Ei \
-e 's/HECATE_/ANANKE_/g' \
-e 's/hecate-self-update/ananke-self-update/g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
"${tmp}"
install -m 0755 "${tmp}" "${target}"
rm -f "${tmp}"
return 0
fi
echo "[install] missing both modern and legacy self-update scripts." >&2
return 1
}
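# Write NUT (Network UPS Tools) configuration and udev rules for the UPS,
# then enable and restart the NUT services.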
configure_nut() {
if [[ "${MANAGE_NUT}" != "1" ]]; then
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
return 0
fi
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
install -d -m 0755 /etc/nut /etc/udev/rules.d
cat > /etc/nut/nut.conf <<EOF
MODE=standalone
EOF
cat > /etc/nut/ups.conf <<EOF
[${NUT_UPS_NAME}]
driver = usbhid-ups
port = auto
vendorid = ${NUT_VENDOR_ID}
productid = ${NUT_PRODUCT_ID}
pollinterval = 5
EOF
cat > /etc/nut/upsd.users <<EOF
[${NUT_MONITOR_USER}]
password = ${NUT_MONITOR_PASSWORD}
upsmon primary
EOF
chmod 0640 /etc/nut/upsd.users
if getent group nut >/dev/null 2>&1; then
chown root:nut /etc/nut/upsd.users
else
chown root:root /etc/nut/upsd.users
fi
cat > /etc/nut/upsmon.conf <<EOF
RUN_AS_USER nut
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
MINSUPPLIES 1
SHUTDOWNCMD "/sbin/shutdown -h +0"
POLLFREQ 5
POLLFREQALERT 5
HOSTSYNC 15
DEADTIME 15
POWERDOWNFLAG /etc/killpower
EOF
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
EOF
udevadm control --reload-rules || true
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
}
ensure_dependencies
migrate_legacy_hecate_install

View File

@@ -6,28 +6,9 @@ cd "${REPO_DIR}"
 export PATH="$(go env GOPATH)/bin:${PATH}"
 STATICCHECK_VERSION="${ANANKE_STATICCHECK_VERSION:-2025.1.1}"
 
-run_with_retry() {
-  local attempts="$1"
-  shift
-  local try=1
-  local delay=3
-  local rc=0
-  while true; do
-    "$@" && return 0
-    rc=$?
-    if [[ "${try}" -ge "${attempts}" ]]; then
-      return "${rc}"
-    fi
-    echo "[lint] retry ${try}/${attempts} after rc=${rc}: $*" >&2
-    sleep "${delay}"
-    delay=$((delay * 2))
-    try=$((try + 1))
-  done
-}
-
 if ! command -v staticcheck >/dev/null 2>&1 || ! staticcheck -version 2>/dev/null | grep -q "${STATICCHECK_VERSION}"; then
   echo "[lint] installing staticcheck ${STATICCHECK_VERSION}"
-  run_with_retry 4 go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
+  go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
 fi
 
 echo "[lint] go vet"

View File

@@ -77,17 +77,6 @@ def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str,
     return 0.0
 
-def _series_exists(pushgateway_url: str, metric: str, labels: dict[str, str], timeout_seconds: float) -> bool:
-    """Return whether Pushgateway already has a series for this build."""
-    text = _read_http(f"{pushgateway_url.rstrip('/')}/metrics", timeout_seconds)
-    for line in text.splitlines():
-        if not line.startswith(metric + "{"):
-            continue
-        if all(f'{key}="{value}"' in line for key, value in labels.items()):
-            return True
-    return False
-
 def _build_payload(
     suite: str,
     trigger: str,
@@ -100,25 +89,9 @@ def _build_payload(
     tests_skipped: int,
     test_cases: list[tuple[str, str]],
     coverage_percent: float,
-    source_files_total: int,
     source_lines_over_500: int,
-    branch: str,
-    build_number: str,
-    jenkins_job: str,
     checks: dict[str, str],
 ) -> str:
-    build_labels = {
-        "suite": suite,
-        "branch": branch,
-        "build_number": build_number or "unknown",
-        "jenkins_job": jenkins_job,
-    }
-    test_case_base_labels = {
-        "suite": suite,
-        "branch": branch,
-        "build_number": build_number or "unknown",
-        "jenkins_job": jenkins_job,
-    }
     lines = [
         "# TYPE platform_quality_gate_runs_total counter",
         f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
@@ -132,30 +105,21 @@ def _build_payload(
         f'ananke_quality_gate_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
         "# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
         f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
-        "# TYPE platform_quality_gate_source_files_total gauge",
-        f'platform_quality_gate_source_files_total{{suite="{suite}"}} {source_files_total}',
         "# TYPE platform_quality_gate_source_lines_over_500_total gauge",
         f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
-        "# TYPE platform_quality_gate_build_info gauge",
-        f"platform_quality_gate_build_info{_label_str(build_labels)} 1",
+        "# TYPE platform_quality_gate_test_case_result gauge",
         "# TYPE ananke_quality_gate_checks_total gauge",
         "# TYPE ananke_quality_gate_publish_info gauge",
         f'ananke_quality_gate_publish_info{_label_str({"suite": suite, "trigger": trigger})} 1',
     ]
+    lines.extend(
+        f'platform_quality_gate_test_case_result{{suite="{suite}",test="{_escape_label(test_name)}",status="{_escape_label(test_status)}"}} 1'
+        for test_name, test_status in test_cases
+    )
     lines.extend(
         f'ananke_quality_gate_checks_total{{suite="{suite}",check="{check_name}",result="{check_status}"}} 1'
        for check_name, check_status in checks.items()
     )
-    lines.append("# TYPE platform_quality_gate_test_case_result gauge")
-    if test_cases:
-        lines.extend(
-            f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': test_name, 'status': test_status})} 1"
-            for test_name, test_status in test_cases
-        )
-    else:
-        lines.append(
-            f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': '__no_test_cases__', 'status': 'skipped'})} 1"
-        )
 
     return "\n".join(lines) + "\n"
@@ -172,7 +136,8 @@ def _read_coverage_percent(path: str) -> float:
     return 0.0
 
-def _iter_source_files(repo_root: Path):
+def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
+    count = 0
     for rel_root in SOURCE_SCAN_ROOTS:
         base = repo_root / rel_root
         if not base.exists():
@@ -182,37 +147,12 @@ def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
                 continue
             if path.suffix not in SOURCE_EXTENSIONS:
                 continue
-            if path.name.endswith("_test.go") or path.name.endswith(".test.py"):
-                continue
-            yield path
-
-def _count_source_files(repo_root: Path) -> int:
-    return sum(1 for _ in _iter_source_files(repo_root))
-
-def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
-    count = 0
-    for path in _iter_source_files(repo_root):
-        lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
-        if lines > max_lines:
-            count += 1
+            lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
+            if lines > max_lines:
+                count += 1
     return count
 
-def _unit_tests_failed(output_path: Path, coverage_percent: float) -> bool:
-    if coverage_percent <= 0 or not output_path.exists():
-        return True
-    text = output_path.read_text(encoding="utf-8", errors="ignore")
-    start_marker = "[quality] unit tests + workspace coverage profile"
-    end_marker = "[quality] hygiene: doc contracts"
-    if start_marker in text:
-        text = text.split(start_marker, 1)[1]
-    if end_marker in text:
-        text = text.split(end_marker, 1)[0]
-    return bool(re.search(r"^(--- FAIL:|FAIL\\b)", text, flags=re.M))
-
 def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
     if not output_path.exists():
         return {"passed": 0, "failed": 0, "errors": 0, "skipped": 0}
@@ -226,37 +166,14 @@ def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
 def _parse_go_test_cases(output_path: Path) -> list[tuple[str, str]]:
-    """Parse per-test status records from go test output text."""
     if not output_path.exists():
         return []
     text = output_path.read_text(encoding="utf-8", errors="ignore")
     cases: list[tuple[str, str]] = []
-    patterns = {
-        "passed": re.compile(r"^--- PASS: ([^\s(]+)", flags=re.M),
-        "failed": re.compile(r"^--- FAIL: ([^\s(]+)", flags=re.M),
-        "skipped": re.compile(r"^--- SKIP: ([^\s(]+)", flags=re.M),
-    }
-    for status, pattern in patterns.items():
-        for test_name in pattern.findall(text):
-            cleaned = str(test_name).strip()
-            if cleaned:
-                cases.append((cleaned, status))
-    if cases:
-        return cases
-    # Fallback for non-verbose `go test` output where individual test names are absent.
-    package_cases: list[tuple[str, str]] = []
-    for package_name in re.findall(r"^ok\s+([^\s]+)", text, flags=re.M):
-        cleaned = str(package_name).strip()
-        if cleaned:
-            package_cases.append((f"package::{cleaned}", "passed"))
-    for package_name in re.findall(r"^FAIL\s+([^\s]+)", text, flags=re.M):
-        cleaned = str(package_name).strip()
-        if cleaned:
-            package_cases.append((f"package::{cleaned}", "failed"))
-    if package_cases:
-        deduped = list(dict.fromkeys(package_cases))
-        return deduped
+    for match in re.finditer(r"^---\s+(PASS|FAIL|SKIP):\s+(\S+)", text, flags=re.M):
+        raw_status, test_name = match.groups()
+        status = {"PASS": "passed", "FAIL": "failed", "SKIP": "skipped"}.get(raw_status, "error")
+        cases.append((test_name.strip(), status))
     return cases
@@ -307,23 +224,17 @@ def _sonarqube_check_status(build_dir: Path) -> str:
 def _supply_chain_check_status(build_dir: Path) -> str:
-    required = os.getenv("QUALITY_GATE_IRONBANK_REQUIRED", "0").strip().lower() in {"1", "true", "yes", "on"}
     report = _load_json(Path(os.getenv("QUALITY_GATE_IRONBANK_REPORT", str(build_dir / "ironbank-compliance.json"))))
     if not report:
-        return "failed" if required else "not_applicable"
+        return "not_applicable"
     compliant = report.get("compliant")
     if isinstance(compliant, bool):
         return "ok" if compliant else "failed"
     status_candidates = [report.get("status"), report.get("result"), report.get("compliance")]
     for value in status_candidates:
         if isinstance(value, str):
-            normalized = value.strip().lower()
-            if normalized in QUALITY_SUCCESS_STATES:
-                return "ok"
-            if normalized in {"n/a", "na", "not_applicable", "not-applicable", "skipped", "skip"}:
-                return "failed" if required else "not_applicable"
-            return "failed" if required else "not_applicable"
-    return "failed" if required else "not_applicable"
+            return "ok" if value.strip().lower() in QUALITY_SUCCESS_STATES else "failed"
+    return "failed"
 
 def parse_args(argv: list[str]) -> argparse.Namespace:
@@ -367,19 +278,10 @@ def main(argv: list[str] | None = None) -> int:
     args = parse_args(argv or sys.argv[1:])
     repo_root = Path(__file__).resolve().parents[1]
     build_dir = repo_root / "build"
-    gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
-    current_ok = 1 if gate_rc == 0 else 0
-    current_failed = 0 if gate_rc == 0 else 1
-    branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
-    if branch.startswith("origin/"):
-        branch = branch[len("origin/") :]
-    build_number = os.getenv("BUILD_NUMBER", "")
-    jenkins_job = os.getenv("JOB_NAME", "ananke")
     remote_ok = 0
     remote_failed = 0
     remote_error = ""
-    already_recorded = False
     try:
         remote_ok = int(
             _fetch_existing_counter(
@@ -397,39 +299,21 @@ def main(argv: list[str] | None = None) -> int:
                 args.timeout_seconds,
             )
         )
-        already_recorded = bool(build_number) and _series_exists(
-            args.pushgateway_url,
-            "platform_quality_gate_build_info",
-            {
-                "job": args.job_name,
-                "suite": args.suite,
-                "branch": branch or "unknown",
-                "build_number": build_number or "unknown",
-                "jenkins_job": jenkins_job,
-            },
-            args.timeout_seconds,
-        )
     except Exception as exc:
         remote_error = str(exc)
-    resolved_ok = remote_ok
-    resolved_failed = remote_failed
-    if remote_error:
-        resolved_ok = args.local_ok
-        resolved_failed = args.local_failed
-    elif not already_recorded:
-        resolved_ok += current_ok
-        resolved_failed += current_failed
+    resolved_ok = max(args.local_ok, remote_ok)
+    resolved_failed = max(args.local_failed, remote_failed)
     coverage_percent = _read_coverage_percent(args.coverage_percent_file)
-    source_files_total = _count_source_files(repo_root)
     source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
-    quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
-    tests = _parse_go_test_counts(quality_output)
-    test_cases = _parse_go_test_cases(quality_output)
+    test_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
+    tests = _parse_go_test_counts(test_output)
+    test_cases = _parse_go_test_cases(test_output)
+    gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
     docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
-    unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent)
+    gate_failed = gate_rc != 0
     checks = {
-        "tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok",
+        "tests": "failed" if gate_failed or tests["failed"] > 0 else "ok",
         "coverage": "ok" if coverage_percent >= 95.0 else "failed",
         "loc": "ok" if source_lines_over_500 == 0 else "failed",
         "docs_naming": docs_status,
@@ -448,11 +332,7 @@ def main(argv: list[str] | None = None) -> int:
         tests_skipped=tests["skipped"],
         test_cases=test_cases,
         coverage_percent=coverage_percent,
-        source_files_total=source_files_total,
         source_lines_over_500=source_lines_over_500,
-        branch=branch,
-        build_number=build_number,
-        jenkins_job=jenkins_job,
         checks=checks,
     )
@@ -465,8 +345,7 @@ def main(argv: list[str] | None = None) -> int:
     summary = (
         f"[quality] published Pushgateway metrics suite={args.suite} job={args.job_name} ok={resolved_ok} "
-        f"failed={resolved_failed} coverage={coverage_percent:.3f} source_files_total={source_files_total} "
-        f"source_lines_over_500={source_lines_over_500}"
+        f"failed={resolved_failed} coverage={coverage_percent:.3f} source_lines_over_500={source_lines_over_500}"
     )
     if remote_error:
         summary += f" remote_read_error={remote_error}"

View File

@@ -3,11 +3,8 @@
 from __future__ import annotations
 
 import http.server
-from pathlib import Path
 import socketserver
-import tempfile
 import threading
-from unittest import mock
 import unittest
 
 import publish_quality_metrics as publisher
@@ -61,19 +58,7 @@ class PublishQualityMetricsTest(unittest.TestCase):
         self.server.server_close()
         self.thread.join(timeout=5)
 
-    def _env_for_gate_status(self, status: int = 0) -> dict[str, str]:
-        tmp_dir = tempfile.TemporaryDirectory()
-        self.addCleanup(tmp_dir.cleanup)
-        rc_path = Path(tmp_dir.name) / "quality-gate.rc"
-        rc_path.write_text(f"{status}\n", encoding="utf-8")
-        return {
-            "ANANKE_QUALITY_EXIT_CODE_PATH": str(rc_path),
-            "ANANKE_QUALITY_COVERAGE_PERCENT_FILE": str(Path(tmp_dir.name) / "coverage.txt"),
-            "ANANKE_QUALITY_OUTPUT_FILE": str(Path(tmp_dir.name) / "quality-gate.out"),
-            "ANANKE_QUALITY_DOCS_STATUS_PATH": str(Path(tmp_dir.name) / "docs-naming.status"),
-        }
-
-    def test_publish_adds_current_run_to_remote_counters(self) -> None:
+    def test_publish_uses_remote_high_water_mark(self) -> None:
         _GatewayHandler.metrics_text = "\n".join(
             [
                 '# TYPE platform_quality_gate_runs_total counter',
@@ -82,93 +67,51 @@ class PublishQualityMetricsTest(unittest.TestCase):
             ]
         )
-        with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
-            exit_code = publisher.main(
-                [
-                    "--pushgateway-url",
-                    self.base_url,
-                    "--job-name",
-                    "platform-quality-ci",
-                    "--suite",
-                    "ananke",
-                    "--trigger",
-                    "host",
-                    "--local-ok",
-                    "5",
-                    "--local-failed",
-                    "2",
-                ]
-            )
+        exit_code = publisher.main(
+            [
+                "--pushgateway-url",
+                self.base_url,
+                "--job-name",
+                "platform-quality-ci",
+                "--suite",
+                "ananke",
+                "--trigger",
+                "host",
+                "--local-ok",
+                "5",
+                "--local-failed",
+                "2",
+            ]
+        )
         self.assertEqual(exit_code, 0)
         self.assertEqual(len(_GatewayHandler.posts), 1)
         path, body = _GatewayHandler.posts[0]
         self.assertEqual(path, "/metrics/job/platform-quality-ci/suite/ananke")
-        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 8', body)
-        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
+        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
+        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 2', body)
         self.assertIn('ananke_quality_gate_publish_info{suite="ananke",trigger="host"} 1', body)
         self.assertIn('ananke_quality_gate_coverage_percent{suite="ananke"}', body)
         self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
-        self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
         self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)
 
-    def test_publish_does_not_double_count_same_build(self) -> None:
-        _GatewayHandler.metrics_text = "\n".join(
-            [
-                'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="ok"} 7',
-                'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="failed"} 1',
-                'platform_quality_gate_build_info{job="platform-quality-ci",suite="ananke",branch="main",build_number="78",jenkins_job="ananke"} 1',
-            ]
-        )
-        with mock.patch.dict(
-            "os.environ",
-            {
-                **self._env_for_gate_status(0),
-                "BRANCH_NAME": "main",
-                "BUILD_NUMBER": "78",
-                "JOB_NAME": "ananke",
-            },
-        ):
-            exit_code = publisher.main(
-                [
-                    "--pushgateway-url",
-                    self.base_url,
-                    "--job-name",
-                    "platform-quality-ci",
-                    "--suite",
-                    "ananke",
-                    "--trigger",
-                    "host",
-                    "--local-ok",
-                    "1",
-                    "--local-failed",
-                    "0",
-                ]
-            )
-        self.assertEqual(exit_code, 0)
-        _, body = _GatewayHandler.posts[0]
-        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
-        self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
-
     def test_publish_falls_back_to_local_counters_when_metrics_read_fails(self) -> None:
         _GatewayHandler.fail_metrics_read = True
-        with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
-            exit_code = publisher.main(
-                [
-                    "--pushgateway-url",
-                    self.base_url,
-                    "--job-name",
-                    "platform-quality-ci",
-                    "--suite",
-                    "ananke",
-                    "--local-ok",
-                    "11",
-                    "--local-failed",
-                    "3",
-                ]
-            )
+        exit_code = publisher.main(
+            [
+                "--pushgateway-url",
+                self.base_url,
+                "--job-name",
+                "platform-quality-ci",
+                "--suite",
+                "ananke",
+                "--local-ok",
+                "11",
+                "--local-failed",
+                "3",
+            ]
+        )
         self.assertEqual(exit_code, 0)
         self.assertEqual(len(_GatewayHandler.posts), 1)
@@ -176,7 +119,6 @@ class PublishQualityMetricsTest(unittest.TestCase):
         self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 11', body)
         self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 3', body)
         self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
-        self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
         self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)

View File

@@ -158,9 +158,15 @@ mkdir -p "${BUILD_DIR}"
 rm -f "${COVERAGE_PROFILE}" "${COVERAGE_PERCENT_FILE}"
 printf 'failed\n' > "${BUILD_DIR}/docs-naming.status"
 
-echo "[quality] dependency download"
+echo "[quality] unit tests + workspace coverage profile"
 export GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}"
 run_with_retry 4 go mod download
+run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
+coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
+if [[ -z "${coverage_percent}" ]]; then
+  coverage_percent="0"
+fi
+printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
 
 echo "[quality] hygiene: doc contracts"
 cd testing
@@ -183,14 +189,6 @@ echo "[quality] lint"
 echo "[quality] installer template contracts"
 ./scripts/verify_install_templates.sh
 
-echo "[quality] unit tests + workspace coverage profile"
-run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
-coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
-if [[ -z "${coverage_percent}" ]]; then
-  coverage_percent="0"
-fi
-printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
-
 echo "[quality] per-file coverage gate (95%)"
 cd testing
 ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v

View File

@@ -17,12 +17,6 @@ import (
 const maxGoFileLOC = 500
 
 var goFileNamePattern = regexp.MustCompile(`^[a-z0-9]+(_[a-z0-9]+)*(_test)?\.go$`)
-var genericFileNameTokens = map[string]struct{}{
-    "chunk": {},
-    "part":  {},
-    "piece": {},
-    "split": {},
-}
 
 func repoRoot(tb testing.TB) string {
     tb.Helper()
@@ -67,16 +61,13 @@ func collectGoFiles(tb testing.TB, roots ...string) []string {
 func TestHygieneContracts(t *testing.T) {
     root := repoRoot(t)
     files := collectGoFiles(t, filepath.Join(root, "cmd"), filepath.Join(root, "internal"))
-    namingFiles := append([]string{}, files...)
-    namingFiles = append(namingFiles, collectGoFiles(t, filepath.Join(root, "testing"))...)
     sort.Strings(files)
-    sort.Strings(namingFiles)
     t.Run("doc_contract", func(t *testing.T) {
         checkDocContracts(t, files)
     })
     t.Run("naming_contract", func(t *testing.T) {
-        checkNamingContracts(t, namingFiles)
+        checkNamingContracts(t, files)
     })
     t.Run("loc_limit", func(t *testing.T) {
         checkFileLOCLimits(t, files)
@@ -130,19 +121,9 @@ func checkNamingContracts(t *testing.T, files []string) {
         if !goFileNamePattern.MatchString(base) {
             t.Errorf("%s: filename %q violates naming contract %s", file, base, goFileNamePattern.String())
         }
-        for _, token := range filenameTokens(base) {
-            if _, ok := genericFileNameTokens[token]; ok {
-                t.Errorf("%s: filename %q uses generic split-file token %q", file, base, token)
-            }
-        }
     }
 }
 
-func filenameTokens(name string) []string {
-    trimmed := strings.TrimSuffix(strings.TrimSuffix(name, ".go"), "_test")
-    return strings.Split(trimmed, "_")
-}
-
 // checkFileLOCLimits runs one orchestration or CLI step.
 // Signature: checkFileLOCLimits(t *testing.T, files []string).
 // Why: A strict LOC cap forces focused files and keeps refactors manageable.

View File

@@ -13,8 +13,6 @@ cmd/ananke/power_safety_test.go
 cmd/ananke/test_helpers_test.go
 internal/cluster/orchestrator_inventory_test.go
 internal/cluster/orchestrator_report_test.go
-internal/cluster/orchestrator_autorepair_test.go
-internal/cluster/orchestrator_autorepair_cleanup_test.go
 internal/cluster/orchestrator_test.go
 internal/cluster/orchestrator_unit_additional_test.go
 internal/cluster/orchestrator_vault_test.go
@@ -23,7 +21,6 @@ internal/config/load_additional_test.go
 internal/config/validate_matrix_test.go
 internal/service/daemon_additional_test.go
 internal/service/daemon_coverage_closeout_test.go
-internal/service/daemon_poststart_autorepair_test.go
 internal/service/daemon_quality_branches_test.go
 internal/service/daemon_test.go
 internal/sshutil/repair_test.go

View File

@@ -363,3 +363,4 @@ func TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T) {
         }
     })
 }
+

View File

@@ -79,29 +79,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
         }
     })
 
-    t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) {
-        cfg := lifecycleConfig(t)
-        cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"}
-        run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
-            command := name + " " + strings.Join(args, " ")
-            switch {
-            case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
-                return `{"items":[
-{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}},
-{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}}
-]}`, nil
-            default:
-                return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
-            }
-        }
-        orch, _ := newHookOrchestrator(t, cfg, run, run)
-        ready, detail, err := orch.TestHookFluxHealthReady(context.Background())
-        if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") {
-            t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
-        }
-    })
-
     t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
         cfg := lifecycleConfig(t)
         cfg.Startup.WorkloadConvergenceWaitSeconds = 1
@@ -168,42 +145,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
         }
     })
 
-    t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) {
-        cfg := lifecycleConfig(t)
-        cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
-        cfg.Startup.StuckPodGraceSeconds = 1
-        run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
-            command := name + " " + strings.Join(args, " ")
-            switch {
-            case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
-                return `{"items":[
-{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
-{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
-]}`, nil
-            case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
-                return `{"items":[
-{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}},
-{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}
-]}`, nil
-            default:
-                return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
-            }
-        }
-        orch, _ := newHookOrchestrator(t, cfg, run, run)
-        ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background())
-        if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
-            t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
-        }
-        failures, err := orch.TestHookStartupFailurePods(context.Background())
-        if err != nil {
-            t.Fatalf("startup failure pod query: %v", err)
-        }
-        if len(failures) != 0 {
-            t.Fatalf("expected optional namespace failures to be ignored, got %v", failures)
-        }
-    })
-
     t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
         cfg := lifecycleConfig(t)
         run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// newHookOrchestratorWithRunnerMode runs one orchestration or CLI step. // newHookOrchestratorAdvanced runs one orchestration or CLI step.
// Signature: newHookOrchestratorWithRunnerMode(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder). // Signature: newHookOrchestratorAdvanced(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
// Why: these scenarios need dry-run and non-dry-run variants while keeping // Why: this part10 matrix needs dry-run and non-dry-run variants while keeping
// command dispatch deterministic from the top-level testing module. // command dispatch deterministic from the top-level testing module.
func newHookOrchestratorWithRunnerMode( func newHookOrchestratorAdvanced(
t *testing.T, t *testing.T,
cfg config.Config, cfg config.Config,
dryRun bool, dryRun bool,
@ -49,11 +49,11 @@ func newHookOrchestratorWithRunnerMode(
return orch, recorder return orch, recorder
} }
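The distinctive piece of the helper above is the dryRun flag it threads into the runner. A sketch of one way such a flag can be honored, assuming an illustrative `dryRunWrap` decorator (not the repo's actual wiring): in dry-run mode commands are recorded but never executed, so orchestration logic can be exercised without side effects.

package main

import (
	"context"
	"fmt"
	"time"
)

// dryRunWrap decorates a command runner so that, in dry-run mode, commands
// are logged and reported as successful without being executed.
func dryRunWrap(dryRun bool, log *[]string, real func(context.Context, time.Duration, string, ...string) (string, error)) func(context.Context, time.Duration, string, ...string) (string, error) {
	return func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		if dryRun {
			*log = append(*log, name)
			return "", nil // pretend success without side effects
		}
		return real(ctx, timeout, name, args...)
	}
}

func main() {
	var log []string
	real := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		return "ran " + name, nil
	}
	run := dryRunWrap(true, &log, real)
	out, _ := run(context.Background(), time.Second, "kubectl")
	fmt.Printf("out=%q recorded=%v\n", out, log) // out="" recorded=[kubectl]
}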
// TestHookVaultLifecycleBranchMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart10LowFileClosure runs one orchestration or CLI step.
// Signature: TestHookVaultLifecycleBranchMatrix(t *testing.T). // Signature: TestHookGapMatrixPart10LowFileClosure(t *testing.T).
// Why: closes remaining branch gaps on low-coverage orchestrator files using // Why: closes remaining branch gaps on low-coverage orchestrator files using
// targeted hook-level scenarios instead of brittle full-drill reruns. // targeted hook-level scenarios instead of brittle full-drill reruns.
func TestHookVaultLifecycleBranchMatrix(t *testing.T) { func TestHookGapMatrixPart10LowFileClosure(t *testing.T) {
t.Run("critical-vault-low-branches", func(t *testing.T) { t.Run("critical-vault-low-branches", func(t *testing.T) {
t.Run("vault-sealed-parse-error", func(t *testing.T) { t.Run("vault-sealed-parse-error", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
@ -64,7 +64,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookVaultSealed(context.Background()); err == nil || !strings.Contains(err.Error(), "parse vault status") { if _, err := orch.TestHookVaultSealed(context.Background()); err == nil || !strings.Contains(err.Error(), "parse vault status") {
t.Fatalf("expected vault status parse error branch, got %v", err) t.Fatalf("expected vault status parse error branch, got %v", err)
} }
@ -81,7 +81,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookVaultUnsealKey(context.Background()); err == nil || !strings.Contains(err.Error(), "vault-init unseal key is empty") { if _, err := orch.TestHookVaultUnsealKey(context.Background()); err == nil || !strings.Contains(err.Error(), "vault-init unseal key is empty") {
t.Fatalf("expected empty decoded unseal key branch, got %v", err) t.Fatalf("expected empty decoded unseal key branch, got %v", err)
} }
@ -90,7 +90,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("write-unseal-key-file-write-error", func(t *testing.T) { t.Run("write-unseal-key-file-write-error", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.VaultUnsealKeyFile = t.TempDir() cfg.Startup.VaultUnsealKeyFile = t.TempDir()
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
if err := orch.TestHookWriteVaultUnsealKeyFile("vault-key"); err == nil || !strings.Contains(err.Error(), "write vault unseal key file") { if err := orch.TestHookWriteVaultUnsealKeyFile("vault-key"); err == nil || !strings.Contains(err.Error(), "write vault unseal key file") {
t.Fatalf("expected write failure branch when key path is a directory, got %v", err) t.Fatalf("expected write failure branch when key path is a directory, got %v", err)
} }
@ -105,7 +105,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchNoValue, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runNoValue, runNoValue) orchNoValue, _ := newHookOrchestratorAdvanced(t, cfg, false, runNoValue, runNoValue)
ready, err := orchNoValue.TestHookWorkloadReady(context.Background(), "vault", "statefulset", "vault") ready, err := orchNoValue.TestHookWorkloadReady(context.Background(), "vault", "statefulset", "vault")
if err != nil || ready { if err != nil || ready {
t.Fatalf("expected no-value readiness branch, ready=%v err=%v", ready, err) t.Fatalf("expected no-value readiness branch, ready=%v err=%v", ready, err)
@ -124,7 +124,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orchEnsureErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runEnsureErr, runEnsureErr) orchEnsureErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runEnsureErr, runEnsureErr)
if err := orchEnsureErr.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "rollout failed") { if err := orchEnsureErr.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "rollout failed") {
t.Fatalf("expected ensureCriticalStartupWorkloads wait error branch, got %v", err) t.Fatalf("expected ensureCriticalStartupWorkloads wait error branch, got %v", err)
} }
@ -139,7 +139,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchPhase, _ := newHookOrchestratorWithRunnerMode(t, cfgPhase, false, runPhase, runPhase) orchPhase, _ := newHookOrchestratorAdvanced(t, cfgPhase, false, runPhase, runPhase)
if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") { if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
t.Fatalf("expected pod phase guard branch, got %v", err) t.Fatalf("expected pod phase guard branch, got %v", err)
} }
@ -170,7 +170,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return runFollowup(ctx, timeout, name, args...) return runFollowup(ctx, timeout, name, args...)
} }
orchFollowup, _ := newHookOrchestratorWithRunnerMode(t, cfgFollowup, false, runFollowup, runSensitive) orchFollowup, _ := newHookOrchestratorAdvanced(t, cfgFollowup, false, runFollowup, runSensitive)
if err := orchFollowup.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") { if err := orchFollowup.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") {
t.Fatalf("expected follow-up sealed status error branch, got %v", err) t.Fatalf("expected follow-up sealed status error branch, got %v", err)
} }
@ -204,7 +204,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
err := orch.TestHookDrainWorkers(context.Background(), workers) err := orch.TestHookDrainWorkers(context.Background(), workers)
if err == nil || !strings.Contains(err.Error(), "drain workers had 5 errors") { if err == nil || !strings.Contains(err.Error(), "drain workers had 5 errors") {
t.Fatalf("expected drain aggregation branch, got %v", err) t.Fatalf("expected drain aggregation branch, got %v", err)
@ -217,7 +217,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
cfg.SSHManagedNodes = []string{"titan-db"} cfg.SSHManagedNodes = []string{"titan-db"}
rec := &commandRecorder{} rec := &commandRecorder{}
base := lifecycleDispatcher(rec) base := lifecycleDispatcher(rec)
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
orch.TestHookRunSSHAcrossNodes(context.Background(), []string{"titan-db", "not-managed"}, "noop", "echo ok") orch.TestHookRunSSHAcrossNodes(context.Background(), []string{"titan-db", "not-managed"}, "noop", "echo ok")
if !rec.contains("atlas@titan-db echo ok") { if !rec.contains("atlas@titan-db echo ok") {
t.Fatalf("expected managed ssh execution branch") t.Fatalf("expected managed ssh execution branch")
@ -233,7 +233,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookLatestEtcdSnapshotPath(context.Background(), "titan-db"); err == nil || !strings.Contains(err.Error(), "no etcd snapshots found") { if _, err := orch.TestHookLatestEtcdSnapshotPath(context.Background(), "titan-db"); err == nil || !strings.Contains(err.Error(), "no etcd snapshots found") {
t.Fatalf("expected empty snapshot-list branch, got %v", err) t.Fatalf("expected empty snapshot-list branch, got %v", err)
} }
@ -250,7 +250,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchWorkers, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runWorkers, runWorkers) orchWorkers, _ := newHookOrchestratorAdvanced(t, cfg, false, runWorkers, runWorkers)
workers, err := orchWorkers.TestHookEffectiveWorkers(context.Background()) workers, err := orchWorkers.TestHookEffectiveWorkers(context.Background())
if err != nil || len(workers) == 0 { if err != nil || len(workers) == 0 {
t.Fatalf("expected inventory worker fallback branch, workers=%v err=%v", workers, err) t.Fatalf("expected inventory worker fallback branch, workers=%v err=%v", workers, err)
@ -273,7 +273,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orchWrite, _ := newHookOrchestratorWithRunnerMode(t, cfgWrite, false, runWrite, runWrite) orchWrite, _ := newHookOrchestratorAdvanced(t, cfgWrite, false, runWrite, runWrite)
if err := orchWrite.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "write scaled workload snapshot") { if err := orchWrite.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "write scaled workload snapshot") {
t.Fatalf("expected scaled snapshot write-failure branch, got %v", err) t.Fatalf("expected scaled snapshot write-failure branch, got %v", err)
} }
@ -294,7 +294,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orchReady, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runReady, runReady) orchReady, _ := newHookOrchestratorAdvanced(t, cfg, false, runReady, runReady)
ready, detail, err := orchReady.TestHookFluxHealthReady(context.Background()) ready, detail, err := orchReady.TestHookFluxHealthReady(context.Background())
if err != nil || ready || !strings.Contains(detail, "ready=false") { if err != nil || ready || !strings.Contains(detail, "ready=false") {
t.Fatalf("expected flux ready-reason fallback branch, ready=%v detail=%q err=%v", ready, detail, err) t.Fatalf("expected flux ready-reason fallback branch, ready=%v detail=%q err=%v", ready, detail, err)
@ -319,7 +319,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cancel() cancel()
if err := orch.TestHookWaitForFluxHealth(ctx); !errors.Is(err, context.Canceled) { if err := orch.TestHookWaitForFluxHealth(ctx); !errors.Is(err, context.Canceled) {
@ -336,7 +336,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
rec := &commandRecorder{} rec := &commandRecorder{}
base := lifecycleDispatcher(rec) base := lifecycleDispatcher(rec)
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil { if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected ensureRequiredNodeLabels skip/apply branches, got %v", err) t.Fatalf("expected ensureRequiredNodeLabels skip/apply branches, got %v", err)
} }
@ -347,7 +347,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("wait-for-startup-convergence-dryrun-and-critical-endpoint-fail", func(t *testing.T) { t.Run("wait-for-startup-convergence-dryrun-and-critical-endpoint-fail", func(t *testing.T) {
cfgDry := lifecycleConfig(t) cfgDry := lifecycleConfig(t)
orchDry, _ := newHookOrchestratorWithRunnerMode(t, cfgDry, true, nil, nil) orchDry, _ := newHookOrchestratorAdvanced(t, cfgDry, true, nil, nil)
if err := orchDry.TestHookWaitForStartupConvergence(context.Background()); err != nil { if err := orchDry.TestHookWaitForStartupConvergence(context.Background()); err != nil {
t.Fatalf("expected startup convergence dry-run fast path, got %v", err) t.Fatalf("expected startup convergence dry-run fast path, got %v", err)
} }
@ -365,7 +365,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchFail, _ := newHookOrchestratorWithRunnerMode(t, cfgFail, false, run, run) orchFail, _ := newHookOrchestratorAdvanced(t, cfgFail, false, run, run)
if err := orchFail.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query endpoints") { if err := orchFail.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query endpoints") {
t.Fatalf("expected critical-endpoint convergence failure branch, got %v", err) t.Fatalf("expected critical-endpoint convergence failure branch, got %v", err)
} }
@ -373,7 +373,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("ingress-namespace-discovery-empty-and-query-error", func(t *testing.T) { t.Run("ingress-namespace-discovery-empty-and-query-error", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
orchEmpty, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil) orchEmpty, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
namespaces, err := orchEmpty.TestHookDiscoverIngressNamespacesForHost(context.Background(), " ") namespaces, err := orchEmpty.TestHookDiscoverIngressNamespacesForHost(context.Background(), " ")
if err != nil || len(namespaces) != 0 { if err != nil || len(namespaces) != 0 {
t.Fatalf("expected empty-host fast path, namespaces=%v err=%v", namespaces, err) t.Fatalf("expected empty-host fast path, namespaces=%v err=%v", namespaces, err)
@ -386,7 +386,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runErr, runErr) orchErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runErr, runErr)
if _, err := orchErr.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query ingresses") { if _, err := orchErr.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query ingresses") {
t.Fatalf("expected ingress query error branch, got %v", err) t.Fatalf("expected ingress query error branch, got %v", err)
} }
@ -412,7 +412,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
URL: "http://" + listener.Addr().String() + "/health", URL: "http://" + listener.Addr().String() + "/health",
AcceptedStatuses: []int{200}, AcceptedStatuses: []int{200},
}} }}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
ready, detail := orch.TestHookServiceChecklistReady(context.Background()) ready, detail := orch.TestHookServiceChecklistReady(context.Background())
if ready || !strings.Contains(detail, "http://") { if ready || !strings.Contains(detail, "http://") {
t.Fatalf("expected service checklist URL-name fallback failure, ready=%v detail=%q", ready, detail) t.Fatalf("expected service checklist URL-name fallback failure, ready=%v detail=%q", ready, detail)
@ -435,7 +435,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cancel() cancel()
if err := orch.TestHookWaitForNodeInventoryReachability(ctx); !errors.Is(err, context.Canceled) { if err := orch.TestHookWaitForNodeInventoryReachability(ctx); !errors.Is(err, context.Canceled) {
@ -456,7 +456,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cancel() cancel()
if err := orch.TestHookWaitForPostStartProbes(ctx); !errors.Is(err, context.Canceled) { if err := orch.TestHookWaitForPostStartProbes(ctx); !errors.Is(err, context.Canceled) {
@ -478,7 +478,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if err := orch.TestHookResumeFluxAndReconcile(context.Background()); err != nil { if err := orch.TestHookResumeFluxAndReconcile(context.Background()); err != nil {
t.Fatalf("expected resume flux warning-only branch, got %v", err) t.Fatalf("expected resume flux warning-only branch, got %v", err)
} }
@ -505,7 +505,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
cancel() cancel()
if err := orch.TestHookWaitForTimeSync(ctx, []string{"", "titan-db"}); !errors.Is(err, context.Canceled) { if err := orch.TestHookWaitForTimeSync(ctx, []string{"", "titan-db"}); !errors.Is(err, context.Canceled) {
@ -532,14 +532,14 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
} }
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run) orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if err := orch.TestHookWaitForWorkloadConvergence(context.Background()); err != nil { if err := orch.TestHookWaitForWorkloadConvergence(context.Background()); err != nil {
t.Fatalf("expected workload convergence default-branch success, got %v", err) t.Fatalf("expected workload convergence default-branch success, got %v", err)
} }
cfgIgnore := lifecycleConfig(t) cfgIgnore := lifecycleConfig(t)
cfgIgnore.Startup.AutoRecycleStuckPods = false cfgIgnore.Startup.AutoRecycleStuckPods = false
orchIgnoreDry, _ := newHookOrchestratorWithRunnerMode(t, cfgIgnore, true, run, run) orchIgnoreDry, _ := newHookOrchestratorAdvanced(t, cfgIgnore, true, run, run)
now := time.Now().UTC().Add(-time.Hour) now := time.Now().UTC().Add(-time.Hour)
orchIgnoreDry.TestHookMaybeAutoRecycleStuckPods(context.Background(), &now) orchIgnoreDry.TestHookMaybeAutoRecycleStuckPods(context.Background(), &now)
orchIgnoreDry.TestHookMaybeAutoHealCriticalWorkloadReplicas(context.Background(), &now) orchIgnoreDry.TestHookMaybeAutoHealCriticalWorkloadReplicas(context.Background(), &now)
@ -551,7 +551,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
} }
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
} }
orchHealErr, _ := newHookOrchestratorWithRunnerMode(t, lifecycleConfig(t), false, runHealErr, runHealErr) orchHealErr, _ := newHookOrchestratorAdvanced(t, lifecycleConfig(t), false, runHealErr, runHealErr)
if _, err := orchHealErr.TestHookHealCriticalWorkloadReplicas(context.Background()); err == nil || !strings.Contains(err.Error(), "query workloads") { if _, err := orchHealErr.TestHookHealCriticalWorkloadReplicas(context.Background()); err == nil || !strings.Contains(err.Error(), "query workloads") {
t.Fatalf("expected critical workload heal query-error branch, got %v", err) t.Fatalf("expected critical workload heal query-error branch, got %v", err)
} }

View File

@ -20,7 +20,7 @@ import (
// newLifecycleMatrixOrchestrator runs one orchestration or CLI step. // newLifecycleMatrixOrchestrator runs one orchestration or CLI step.
// Signature: newLifecycleMatrixOrchestrator(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride, kubeconfig string) *cluster.Orchestrator. // Signature: newLifecycleMatrixOrchestrator(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride, kubeconfig string) *cluster.Orchestrator.
// Why: lifecycle cleanup scenarios need direct control over runner dry-run and kubeconfig branches. // Why: part11 needs direct control over runner dry-run and kubeconfig branches.
func newLifecycleMatrixOrchestrator( func newLifecycleMatrixOrchestrator(
t *testing.T, t *testing.T,
cfg config.Config, cfg config.Config,
@ -49,11 +49,11 @@ func newLifecycleMatrixOrchestrator(
return orch return orch
} }
// TestHookLifecycleCleanupRemainingClosure runs one orchestration or CLI step. // TestHookGapMatrixPart11RemainingClosure runs one orchestration or CLI step.
// Signature: TestHookLifecycleCleanupRemainingClosure(t *testing.T). // Signature: TestHookGapMatrixPart11RemainingClosure(t *testing.T).
// Why: closes final branch gaps for lifecycle + remaining near-threshold // Why: closes final branch gaps for lifecycle + remaining near-threshold
// orchestrator files so per-file coverage reaches the enforced 95% target. // orchestrator files so per-file coverage reaches the enforced 95% target.
func TestHookLifecycleCleanupRemainingClosure(t *testing.T) { func TestHookGapMatrixPart11RemainingClosure(t *testing.T) {
t.Run("critical-vault-final-closures", func(t *testing.T) { t.Run("critical-vault-final-closures", func(t *testing.T) {
t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) { t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
switch { switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"): case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
apiVersionCalls++ apiVersionCalls++
if apiVersionCalls <= 2 { if apiVersionCalls == 1 {
return "", errors.New("api down") return "", errors.New("api down")
} }
return "v1.31.0", nil return "v1.31.0", nil

View File

@ -17,11 +17,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookTimesyncAndStabilityMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
// Signature: TestHookTimesyncAndStabilityMatrix(t *testing.T). // Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
// Why: drives low-coverage time-sync, datastore parsing, and startup stability // Why: drives low-coverage time-sync, datastore parsing, and startup stability
// branches from the top-level testing module. // branches from the top-level testing module.
func TestHookTimesyncAndStabilityMatrix(t *testing.T) { func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) { t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
cases := []struct { cases := []struct {
line string line string
@ -162,11 +162,11 @@ func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
}) })
} }
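The parse-datastore-endpoint matrix above is table-driven. As a sketch of that style, here is a standalone case table against a hypothetical `parseEndpoint` stand-in (the real parser lives in the orchestrator internals and is not reproduced here); save as parse_test.go and run go test.

package parse

import (
	"strings"
	"testing"
)

// parseEndpoint is an illustrative stand-in for the datastore-endpoint parser
// the matrix above exercises.
func parseEndpoint(line string) (string, bool) {
	const prefix = "datastore-endpoint="
	if rest, ok := strings.CutPrefix(line, prefix); ok && rest != "" {
		return rest, true
	}
	return "", false
}

func TestParseEndpointMatrix(t *testing.T) {
	cases := []struct {
		name string
		line string
		want string
		ok   bool
	}{
		{name: "valid", line: "datastore-endpoint=https://titan-db:2379", want: "https://titan-db:2379", ok: true},
		{name: "empty", line: "", want: "", ok: false},
		{name: "wrong-key", line: "token=abc", want: "", ok: false},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, ok := parseEndpoint(tc.line)
			if got != tc.want || ok != tc.ok {
				t.Fatalf("parseEndpoint(%q) = (%q, %v), want (%q, %v)", tc.line, got, ok, tc.want, tc.ok)
			}
		})
	}
}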
// TestHookFluxScalingReportMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
// Signature: TestHookFluxScalingReportMatrix(t *testing.T). // Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
// Why: targets low branch density in flux-health, scaling snapshot handling, // Why: targets low branch density in flux-health, scaling snapshot handling,
// and report sanitization helpers. // and report sanitization helpers.
func TestHookFluxScalingReportMatrix(t *testing.T) { func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
t.Run("flux-helper-matrix", func(t *testing.T) { t.Run("flux-helper-matrix", func(t *testing.T) {
if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") { if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
t.Fatalf("expected immutable matcher true for uppercase+job variant") t.Fatalf("expected immutable matcher true for uppercase+job variant")
@ -241,11 +241,11 @@ func TestHookFluxScalingReportMatrix(t *testing.T) {
}) })
} }
// TestHookVaultAndCoordinationMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
// Signature: TestHookVaultAndCoordinationMatrix(t *testing.T). // Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
// Why: raises branch coverage on vault/key and coordination helpers without // Why: raises branch coverage on vault/key and coordination helpers without
// requiring package-local tests. // requiring package-local tests.
func TestHookVaultAndCoordinationMatrix(t *testing.T) { func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
t.Run("vault-unseal-and-file-branches", func(t *testing.T) { t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.VaultUnsealKeyFile = "" cfg.Startup.VaultUnsealKeyFile = ""
@ -296,11 +296,11 @@ func TestHookVaultAndCoordinationMatrix(t *testing.T) {
}) })
} }
// TestHookWorkloadIgnoreMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
// Signature: TestHookWorkloadIgnoreMatrix(t *testing.T). // Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
// Why: expands low branch coverage in workload ignore helpers and startup-failure // Why: expands low branch coverage in workload ignore helpers and startup-failure
// pod classification. // pod classification.
func TestHookWorkloadIgnoreMatrix(t *testing.T) { func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) {
t.Run("ignored-node-helper-matrix", func(t *testing.T) { t.Run("ignored-node-helper-matrix", func(t *testing.T) {
if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) { if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
t.Fatalf("expected selector-host ignored match") t.Fatalf("expected selector-host ignored match")

View File

@ -11,11 +11,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/config" "scm.bstein.dev/bstein/ananke/internal/config"
) )
// TestHookConvergenceAndStabilityMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart3ConvergenceAndStability runs one orchestration or CLI step.
// Signature: TestHookConvergenceAndStabilityMatrix(t *testing.T). // Signature: TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T).
// Why: raises coverage for startup convergence orchestration and stability gates // Why: raises coverage for startup convergence orchestration and stability gates
// that determine whether startup is considered truly complete. // that determine whether startup is considered truly complete.
func TestHookConvergenceAndStabilityMatrix(t *testing.T) { func TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T) {
t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) { t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) {
cfgIngress := lifecycleConfig(t) cfgIngress := lifecycleConfig(t)
cfgIngress.Startup.RequireIngressChecklist = true cfgIngress.Startup.RequireIngressChecklist = true
@ -108,11 +108,11 @@ func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
}) })
} }
// TestHookLifecycleRestoreShutdownMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart3LifecycleRestoreShutdown runs one orchestration or CLI step.
// Signature: TestHookLifecycleRestoreShutdownMatrix(t *testing.T). // Signature: TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T).
// Why: fills lifecycle restore/shutdown success paths that are easy to miss in // Why: fills lifecycle restore/shutdown success paths that are easy to miss in
// failure-focused drill tests. // failure-focused drill tests.
func TestHookLifecycleRestoreShutdownMatrix(t *testing.T) { func TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T) {
t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) { t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) {
cfgDry := lifecycleConfig(t) cfgDry := lifecycleConfig(t)
dry := newDryRunHookOrchestrator(t, cfgDry, nil) dry := newDryRunHookOrchestrator(t, cfgDry, nil)

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookCoordinationAndReachabilityMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart4CoordinationAndReachability runs one orchestration or CLI step.
// Signature: TestHookCoordinationAndReachabilityMatrix(t *testing.T). // Signature: TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T).
// Why: closes remaining coordination/reachability low branches with deterministic // Why: closes remaining coordination/reachability low branches with deterministic
// command responses and short timeouts. // command responses and short timeouts.
func TestHookCoordinationAndReachabilityMatrix(t *testing.T) { func TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T) {
t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) { t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Coordination.PeerHosts = []string{"titan-24"} cfg.Coordination.PeerHosts = []string{"titan-24"}
@ -136,11 +136,11 @@ func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
}) })
} }
// TestHookIngressServiceAndPostStartMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart4IngressServiceAndPostStart runs one orchestration or CLI step.
// Signature: TestHookIngressServiceAndPostStartMatrix(t *testing.T). // Signature: TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T).
// Why: drives ingress/service checklist and post-start branches that were still // Why: drives ingress/service checklist and post-start branches that were still
// under-covered after drill-focused matrix tests. // under-covered after drill-focused matrix tests.
func TestHookIngressServiceAndPostStartMatrix(t *testing.T) { func TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T) {
t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) { t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{ cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
@ -194,11 +194,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil) orch, _ := newHookOrchestrator(t, cfg, nil, nil)
ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{ ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{
Name: "forbidden-marker", Name: "forbidden-marker",
URL: srv.URL, URL: srv.URL,
AcceptedStatuses: []int{200}, AcceptedStatuses: []int{200},
BodyNotContains: "marker", BodyNotContains: "marker",
TimeoutSeconds: 2, TimeoutSeconds: 2,
}) })
if ok || !strings.Contains(detail, "forbidden marker") { if ok || !strings.Contains(detail, "forbidden marker") {
t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail) t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail)
@ -233,11 +233,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
}) })
} }
// TestHookReportScalingStorageDrainMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart4ReportScalingStorageDrain runs one orchestration or CLI step.
// Signature: TestHookReportScalingStorageDrainMatrix(t *testing.T). // Signature: TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T).
// Why: covers artifact, scaling snapshot, storage, and drain error branches that // Why: covers artifact, scaling snapshot, storage, and drain error branches that
// are difficult to hit from happy-path lifecycle drills. // are difficult to hit from happy-path lifecycle drills.
func TestHookReportScalingStorageDrainMatrix(t *testing.T) { func TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T) {
t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) { t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
reportsFile := filepath.Join(t.TempDir(), "reports-as-file") reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
@ -339,11 +339,11 @@ func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
}) })
} }
// TestHookTimesyncLifecycleAndAccessMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart4TimesyncLifecycleAndAccess runs one orchestration or CLI step.
// Signature: TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T). // Signature: TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T).
// Why: closes remaining timing/access/lifecycle branches that still sat below // Why: closes remaining timing/access/lifecycle branches that still sat below
// target after the earlier matrices. // target after the earlier matrices.
func TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T) { func TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T) {
t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) { t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.TimeSyncMode = "quorum" cfg.Startup.TimeSyncMode = "quorum"

View File

@ -20,11 +20,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookEndpointHealingCoverageClosure runs one orchestration or CLI step. // TestHookGapMatrixPart5CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookEndpointHealingCoverageClosure(t *testing.T). // Signature: TestHookGapMatrixPart5CoverageClosure(t *testing.T).
// Why: closes branch gaps that still remained after drill-style tests by driving // Why: closes branch gaps that still remained after drill-style tests by driving
// low-coverage orchestrator internals through the exported top-level hook surface. // low-coverage orchestrator internals through the exported top-level hook surface.
func TestHookEndpointHealingCoverageClosure(t *testing.T) { func TestHookGapMatrixPart5CoverageClosure(t *testing.T) {
t.Run("critical-endpoint-backend-heal-matrix", func(t *testing.T) { t.Run("critical-endpoint-backend-heal-matrix", func(t *testing.T) {
t.Run("empty-namespace-service-noop", func(t *testing.T) { t.Run("empty-namespace-service-noop", func(t *testing.T) {
orch, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil) orch, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
@ -491,10 +491,10 @@ func httpStatusHandler(code int, body string) func(http.ResponseWriter, *http.Re
} }
} }
// TestHookIngressHostMappingRegression runs one orchestration or CLI step. // TestHookGapMatrixPart5IngressHostMappingRegression runs one orchestration or CLI step.
// Signature: TestHookIngressHostMappingRegression(t *testing.T). // Signature: TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T).
// Why: ensures host parsing fallback paths stay stable for ingress/service checklist failures. // Why: ensures host parsing fallback paths stay stable for ingress/service checklist failures.
func TestHookIngressHostMappingRegression(t *testing.T) { func TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{ cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"}, {Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},

View File

@ -16,11 +16,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookVaultPostStartBranchMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart6CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookVaultPostStartBranchMatrix(t *testing.T). // Signature: TestHookGapMatrixPart6CoverageClosure(t *testing.T).
// Why: targets the remaining low branch paths after endpoint-healing coverage so per-file coverage // Why: targets the remaining low branch paths after part5 so per-file coverage
// can move toward the strict 95% quality gate. // can move toward the strict 95% quality gate.
func TestHookVaultPostStartBranchMatrix(t *testing.T) { func TestHookGapMatrixPart6CoverageClosure(t *testing.T) {
t.Run("critical-vault-and-poststart-branches", func(t *testing.T) { t.Run("critical-vault-and-poststart-branches", func(t *testing.T) {
t.Run("wait-vault-ready-dryrun-and-cancel", func(t *testing.T) { t.Run("wait-vault-ready-dryrun-and-cancel", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)

View File

@ -14,11 +14,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookWorkloadStorageAccessMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart7CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookWorkloadStorageAccessMatrix(t *testing.T). // Signature: TestHookGapMatrixPart7CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in convergence, storage, access, // Why: closes additional low-coverage branches in convergence, storage, access,
// flux, lifecycle, and sensitive command wrappers. // flux, lifecycle, and sensitive command wrappers.
func TestHookWorkloadStorageAccessMatrix(t *testing.T) { func TestHookGapMatrixPart7CoverageClosure(t *testing.T) {
t.Run("workload-convergence-branch-matrix", func(t *testing.T) { t.Run("workload-convergence-branch-matrix", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceWaitSeconds = 1 cfg.Startup.WorkloadConvergenceWaitSeconds = 1
@ -165,32 +165,6 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err) t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
} }
}) })
t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequireNodeSSHAuth = true
cfg.Startup.NodeSSHAuthWaitSeconds = 1
cfg.Startup.NodeSSHAuthPollSeconds = 1
cfg.Startup.NodeInventoryReachWaitSeconds = 1
cfg.Startup.NodeInventoryReachPollSeconds = 1
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") {
return "", errors.New("no route to host")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err)
}
if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil {
t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err)
}
})
}) })
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) { t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookAccessVaultLifecycleMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart8CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookAccessVaultLifecycleMatrix(t *testing.T). // Signature: TestHookGapMatrixPart8CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in access, vault, lifecycle, // Why: closes additional low-coverage branches in access, vault, lifecycle,
// ingress/service stability, and timesync/inventory orchestration paths. // ingress/service stability, and timesync/inventory orchestration paths.
func TestHookAccessVaultLifecycleMatrix(t *testing.T) { func TestHookGapMatrixPart8CoverageClosure(t *testing.T) {
t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) { t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.RequireNodeSSHAuth = true cfg.Startup.RequireNodeSSHAuth = true
@ -331,11 +331,11 @@ func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
}) })
} }
// TestHookLifecycleStartupAutoRestoreBranch runs one orchestration or CLI step. // TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
// Signature: TestHookLifecycleStartupAutoRestoreBranch(t *testing.T). // Signature: TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T).
// Why: covers Startup's API-failure->auto-restore retry path that is otherwise // Why: covers Startup's API-failure->auto-restore retry path that is otherwise
// hard to exercise in deterministic top-level tests. // hard to exercise in deterministic top-level tests.
func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) { func TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
cfg.Startup.EtcdRestoreControlPlane = "titan-db" cfg.Startup.EtcdRestoreControlPlane = "titan-db"
@ -384,7 +384,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
} }
} }
orch, _ := newHookOrchestrator(t, cfg, run, run) orch, _ := newHookOrchestrator(t, cfg, run, run)
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lifecycle-auto-restore"}) err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "part8-auto-restore"})
if err != nil { if err != nil {
t.Fatalf("expected startup auto-restore path success, got %v", err) t.Fatalf("expected startup auto-restore path success, got %v", err)
} }
@ -394,7 +394,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
cfgBadMode := lifecycleConfig(t) cfgBadMode := lifecycleConfig(t)
orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil) orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil)
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "lifecycle", Mode: "unknown-mode"}) err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "part8", Mode: "unknown-mode"})
if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") { if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") {
t.Fatalf("expected shutdown unsupported-mode branch, got %v", err) t.Fatalf("expected shutdown unsupported-mode branch, got %v", err)
} }

View File

@ -16,11 +16,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestHookAccessCoordinationEndpointsMatrix runs one orchestration or CLI step. // TestHookGapMatrixPart9AccessCoordinationEndpoints runs one orchestration or CLI step.
// Signature: TestHookAccessCoordinationEndpointsMatrix(t *testing.T). // Signature: TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T).
// Why: closes uncovered statement ranges in access/fluxsource, coordination, // Why: closes uncovered statement ranges in access/fluxsource, coordination,
// and critical-endpoint orchestration helpers. // and critical-endpoint orchestration helpers.
func TestHookAccessCoordinationEndpointsMatrix(t *testing.T) { func TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T) {
t.Run("access-fluxsource-uncovered-ranges", func(t *testing.T) { t.Run("access-fluxsource-uncovered-ranges", func(t *testing.T) {
cfg := lifecycleConfig(t) cfg := lifecycleConfig(t)
cfg.Shutdown.SSHParallelism = 0 cfg.Shutdown.SSHParallelism = 0

View File

@ -53,48 +53,6 @@ func TestHookIngressServiceMatrix(t *testing.T) {
} }
}) })
t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command)
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err)
}
})
t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
}
})
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) { t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK) w.WriteHeader(http.StatusOK)

View File

@ -4,6 +4,7 @@ import (
"context" "context"
"errors" "errors"
"net" "net"
"os"
"strings" "strings"
"testing" "testing"
"time" "time"
@ -124,25 +125,20 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) { t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t) cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false cfg.Startup.ShutdownCooldownSeconds = 1
cfg.Startup.ShutdownCooldownSeconds = 5 if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
reads := 0 State: state.IntentShutdownComplete,
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) { Reason: "recent",
if path != cfg.State.IntentPath { Source: "test",
return state.TestHookReadIntentDefault(path) UpdatedAt: time.Now().UTC(),
} }); err != nil {
reads++ t.Fatalf("seed cooldown intent: %v", err)
if reads == 1 { }
return state.Intent{ go func(intentPath string) {
State: state.IntentShutdownComplete, time.Sleep(150 * time.Millisecond)
Reason: "recent", _ = os.Remove(intentPath)
Source: "test", _ = os.Mkdir(intentPath, 0o755)
UpdatedAt: time.Now().UTC().Add(-4 * time.Second), }(cfg.State.IntentPath)
}, nil
}
return state.Intent{}, errors.New("forced reread failure")
})
t.Cleanup(restoreRead)
orch, _ := newHookOrchestrator(t, cfg, nil, nil) orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"}) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") { if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
@ -152,30 +148,24 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) { t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
cfg := lifecycleFastConfig(t) cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false cfg.Startup.ShutdownCooldownSeconds = 1
cfg.Startup.ShutdownCooldownSeconds = 5 if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
reads := 0 State: state.IntentShutdownComplete,
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) { Reason: "recent",
if path != cfg.State.IntentPath { Source: "test",
return state.TestHookReadIntentDefault(path) UpdatedAt: time.Now().UTC(),
} }); err != nil {
reads++ t.Fatalf("seed cooldown intent: %v", err)
if reads == 1 { }
return state.Intent{ go func(intentPath string) {
State: state.IntentShutdownComplete, time.Sleep(150 * time.Millisecond)
Reason: "recent", _ = state.WriteIntent(intentPath, state.Intent{
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{
State: state.IntentShuttingDown, State: state.IntentShuttingDown,
Reason: "peer-shutdown", Reason: "peer-shutdown",
Source: "test", Source: "test",
UpdatedAt: time.Now().UTC(), UpdatedAt: time.Now().UTC(),
}, nil })
}) }(cfg.State.IntentPath)
t.Cleanup(restoreRead)
orch, _ := newHookOrchestrator(t, cfg, nil, nil) orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"}) err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") { if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {

View File

@ -1,432 +0,0 @@
package orchestrator
import (
"context"
"errors"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
)
// TestHookSchedulingStormHelpers runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormHelpers(t *testing.T).
// Why: keeps scheduling-storm helper coverage in the split top-level testing module
// required by the repo hygiene contract.
func TestHookSchedulingStormHelpers(t *testing.T) {
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "ollama-rs", "Deployment", "ollama"); !ok || got != "ai/deployment/ollama" {
t.Fatalf("unexpected deployment owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("storage", "StatefulSet", "nextcloud", "", ""); !ok || got != "storage/statefulset/nextcloud" {
t.Fatalf("unexpected statefulset owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "missing", "", ""); ok || got != "" {
t.Fatalf("expected missing replicaset owner lookup to fail, got=%q ok=%v", got, ok)
}
if got := cluster.TestHookEventObservationCount(3, 9); got != 9 {
t.Fatalf("expected series count to win, got %d", got)
}
if got := cluster.TestHookEventObservationCount(0, 0); got != 1 {
t.Fatalf("expected zero-count normalization to 1, got %d", got)
}
now := time.Now().UTC().Round(time.Second)
if got := cluster.TestHookEventLastObservedAt(now, now.Add(-time.Minute), now.Add(-2*time.Minute), now.Add(-3*time.Minute)); !got.Equal(now) {
t.Fatalf("expected series timestamp priority, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, now, now.Add(-time.Minute), now.Add(-2*time.Minute)); !got.Equal(now) {
t.Fatalf("expected lastTimestamp fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, now, now.Add(-time.Minute)); !got.Equal(now) {
t.Fatalf("expected eventTime fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, time.Time{}, now); !got.Equal(now) {
t.Fatalf("expected creationTimestamp fallback, got %s", got)
}
}
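The removed helper tests above pin down two small behaviors: observation counts prefer the event-series count and never drop below one, and the last-observed timestamp falls back through series, lastTimestamp, eventTime, then creationTimestamp. A sketch reconstructing that logic from the assertions (argument order and names are inferred, not copied from the repo):

package main

import (
	"fmt"
	"time"
)

// lastObservedAt picks the freshest meaningful timestamp for an event in the
// priority order the tests above assert: series last-seen, then
// lastTimestamp, then eventTime, then creationTimestamp.
func lastObservedAt(series, last, eventTime, creation time.Time) time.Time {
	for _, ts := range []time.Time{series, last, eventTime, creation} {
		if !ts.IsZero() {
			return ts
		}
	}
	return time.Time{}
}

// observationCount prefers the series count, falls back to the event count,
// and never reports fewer than one observation.
func observationCount(eventCount, seriesCount int) int {
	if seriesCount > 0 {
		return seriesCount
	}
	if eventCount > 0 {
		return eventCount
	}
	return 1
}

func main() {
	now := time.Now().UTC()
	fmt.Println(lastObservedAt(time.Time{}, now, now.Add(-time.Minute), now.Add(-2*time.Minute)).Equal(now)) // true: lastTimestamp fallback
	fmt.Println(observationCount(3, 9), observationCount(0, 0))                                              // 9 1
}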
// TestHookSchedulingStormQuarantine runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormQuarantine(t *testing.T).
// Why: verifies that only non-core workloads generating real scheduling storms
// are auto-quarantined, which prevents event/Kine churn from spiking control-plane CPU.
func TestHookSchedulingStormQuarantine(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 30
cfg.Startup.SchedulingStormWindowSeconds = 180
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault"}
cfg.Startup.IgnoreWorkloadNamespaces = []string{"ignored-ns"}
cfg.Startup.IgnoreWorkloads = []string{"monitoring/deployment/ignore-me"}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-22"}
scaledOllama := false
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"vault","name":"vault-0","ownerReferences":[{"kind":"StatefulSet","name":"vault"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"ignored-ns","name":"skip-pod","ownerReferences":[{"kind":"ReplicaSet","name":"skip-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignore-me-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignored-node-rs"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"running-pod","ownerReferences":[{"kind":"ReplicaSet","name":"running-rs"}]},"spec":{},"status":{"phase":"Running"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}},
{"metadata":{"namespace":"ignored-ns","name":"skip-rs","ownerReferences":[{"kind":"Deployment","name":"skip"}]}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-rs","ownerReferences":[{"kind":"Deployment","name":"ignore-me"}]}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-rs","ownerReferences":[{"kind":"Deployment","name":"ignored-node"}]}},
{"metadata":{"namespace":"monitoring","name":"running-rs","ownerReferences":[{"kind":"Deployment","name":"running"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"vault","name":"vault-0"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ignored-ns","name":"skip-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignore-me-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignored-node-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"running-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"stale-pod"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}},
{"kind":"StatefulSet","metadata":{"namespace":"vault","name":"vault"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"ignored-ns","name":"skip"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignore-me"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignored-node"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"running"},"spec":{"replicas":1}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
scaledOllama = true
return "", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm")
defer orch.TestHookFinalizeStartupReport(nil)
if err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background()); err != nil {
t.Fatalf("quarantine scheduling storm workloads: %v", err)
}
if !scaledOllama {
t.Fatalf("expected ollama deployment to be scaled to zero")
}
progress := readStartupProgress(t, orch)
if !strings.Contains(progress, "ollama") {
t.Fatalf("expected startup progress to mention ollama quarantine, payload=%s", progress)
}
if strings.Contains(progress, "vault") || strings.Contains(progress, "ignore-me") || strings.Contains(progress, "ignored-node") {
t.Fatalf("expected only the non-core eligible workload to be quarantined, payload=%s", progress)
}
}
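// Hypothetical eligibility sketch for the behavior verified above: a storming
// workload is quarantined only when it is still pending and no required or
// ignore scope protects it. Names and signature are illustrative, not the
// shipped API.
func sketchQuarantineEligible(pending bool, namespace, workload, node string,
	requiredNS, ignoredNS, ignoredWorkloads, ignoredNodes map[string]bool) bool {
	if !pending {
		return false // running pods are not part of a scheduling storm
	}
	if requiredNS[namespace] || ignoredNS[namespace] {
		return false // core and explicitly ignored namespaces stay untouched
	}
	return !ignoredWorkloads[workload] && !ignoredNodes[node]
}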
// TestHookSchedulingStormTriggerGuards runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerGuards(t *testing.T).
// Why: covers dry-run/disabled/rate-limit guards so the scheduler-storm auto-heal
// only activates when the cluster is actually suffering this exact failure mode.
func TestHookSchedulingStormTriggerGuards(t *testing.T) {
cfgDisabled := lifecycleConfig(t)
orchDisabled, _ := newHookOrchestrator(t, cfgDisabled, nil, nil)
lastAttempt := time.Time{}
orchDisabled.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected disabled scheduling-storm trigger to be skipped")
}
cfgDry := lifecycleConfig(t)
cfgDry.Startup.AutoQuarantineSchedulingStorms = true
orchDry := newDryRunHookOrchestrator(t, cfgDry, nil)
orchDry.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected dry-run scheduling-storm trigger to be skipped")
}
cfgRate := lifecycleConfig(t)
cfgRate.Startup.AutoQuarantineSchedulingStorms = true
cfgRate.Startup.SchedulingStormEventThreshold = 5
cfgRate.Startup.SchedulingStormWindowSeconds = 60
recorder := &commandRecorder{}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
recorder.record(name, args)
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[]}`, nil
default:
return lifecycleDispatcher(recorder)(ctx, timeout, name, args...)
}
}
orchRate, _ := newHookOrchestrator(t, cfgRate, run, run)
lastAttempt = time.Now()
orchRate.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if recorder.contains("get pods -A -o json") {
t.Fatalf("expected rate-limited scheduling-storm trigger to skip kubectl scans")
}
}
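// Hypothetical guard sketch mirroring the three skips above: the trigger is a
// no-op when the feature is disabled or running dry, and is rate-limited by a
// minimum interval between scans. minInterval is an assumed parameter.
func sketchShouldScanForStorms(enabled, dryRun bool, lastAttempt time.Time, minInterval time.Duration) bool {
	if !enabled || dryRun {
		return false
	}
	return lastAttempt.IsZero() || time.Since(lastAttempt) >= minInterval
}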
// TestHookSchedulingStormTriggerAndNoOpBranches runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T).
// Why: raises scheduling-storm branch coverage on the success/no-op paths so the
// auto-heal only acts on genuine event storms and stays quiet otherwise.
func TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 0
cfg.Startup.SchedulingStormWindowSeconds = 0
scanRan := false
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
scanRan = true
return `{"items":[
{"metadata":{"namespace":"","name":"missing"}},
{"metadata":{"namespace":"monitoring","name":"no-owner"},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"done","ownerReferences":[{"kind":"ReplicaSet","name":"done-rs"}]},"spec":{},"status":{"phase":"Running"}},
{"metadata":{"namespace":"monitoring","name":"zero-replicas","ownerReferences":[{"kind":"ReplicaSet","name":"zero-rs"}]},"spec":{},"status":{"phase":"Pending"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"","name":"bad-rs"}},
{"metadata":{"namespace":"monitoring","name":"done-rs","ownerReferences":[{"kind":"","name":"ignored"}]}},
{"metadata":{"namespace":"monitoring","name":"zero-rs","ownerReferences":[{"kind":"Deployment","name":"zero"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"normal"},"type":"Normal","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"wrong-reason"},"type":"Warning","reason":"SomeOtherReason","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Service","namespace":"monitoring","name":"wrong-kind"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"old"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"low-count"},"type":"Warning","reason":"FailedScheduling","count":1},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"missing-pod"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"done"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"no-owner"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"zero-replicas"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"","metadata":{"namespace":"monitoring","name":"blank-kind"}},
{"kind":"Job","metadata":{"namespace":"monitoring","name":"unsupported"}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"zero"},"spec":{"replicas":0}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm-noop")
defer orch.TestHookFinalizeStartupReport(nil)
lastAttempt := time.Time{}
orch.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if lastAttempt.IsZero() {
t.Fatalf("expected successful scheduling-storm trigger to update lastAttempt")
}
if !scanRan {
t.Fatalf("expected scheduling-storm scan to execute")
}
progress := readStartupProgress(t, orch)
if strings.Contains(progress, "quarantined scheduling storm workload") {
t.Fatalf("expected no-op scheduling-storm scan to avoid auto-heal output, payload=%s", progress)
}
}
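// Hypothetical filter sketch for the no-op branches above: an event counts
// toward a storm only when it is a Warning FailedScheduling event against a
// Pod, observed within the window, and at or above the count threshold.
func sketchIsStormEvent(kind, eventType, reason string, observed time.Time, count, threshold int, window time.Duration) bool {
	if kind != "Pod" || eventType != "Warning" || reason != "FailedScheduling" {
		return false
	}
	return time.Since(observed) <= window && count >= threshold
}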
// TestHookSchedulingStormErrorMatrix runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormErrorMatrix(t *testing.T).
// Why: covers malformed/error response branches in the scheduling-storm scan so
// Ananke can surface precise diagnostics when the API itself is part of the problem.
func TestHookSchedulingStormErrorMatrix(t *testing.T) {
cases := []struct {
name string
run func(context.Context, time.Duration, string, ...string) (string, error)
wantErr string
}{
{
name: "pods-query-error",
run: func(_ context.Context, _ time.Duration, name string, _ ...string) (string, error) {
if name == "kubectl" {
return "", errors.New("pods boom")
}
return "", nil
},
wantErr: "query pods for scheduling storm scan",
},
{
name: "pods-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name == "kubectl" && strings.Contains(strings.Join(args, " "), "get pods -A -o json") {
return "{", nil
}
return `{"items":[]}`, nil
},
wantErr: "decode pods for scheduling storm scan",
},
{
name: "replicasets-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "", errors.New("replicasets boom")
default:
return "", nil
}
},
wantErr: "query replicasets for scheduling storm scan",
},
{
name: "replicasets-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode replicasets for scheduling storm scan",
},
{
name: "events-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "", errors.New("events boom")
default:
return "", nil
}
},
wantErr: "query events for scheduling storm scan",
},
{
name: "events-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode events for scheduling storm scan",
},
{
name: "workloads-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "", errors.New("workloads boom")
default:
return "", nil
}
},
wantErr: "query workloads for scheduling storm scan",
},
{
name: "workloads-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "{", nil
default:
return "", nil
}
},
wantErr: "decode workloads for scheduling storm scan",
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
orch, _ := newHookOrchestrator(t, cfg, tc.run, tc.run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
t.Fatalf("expected error containing %q, got %v", tc.wantErr, err)
}
})
}
}
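// Hypothetical sketch of the stage-prefixed wrapping the matrix asserts; the
// shipped scan is not reproduced here, and fmt, encoding/json, and the pods
// variable are assumed:
//
//	out, err := run(ctx, timeout, "kubectl", "get", "pods", "-A", "-o", "json")
//	if err != nil {
//		return fmt.Errorf("query pods for scheduling storm scan: %w", err)
//	}
//	if err := json.Unmarshal([]byte(out), &pods); err != nil {
//		return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
//	}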
// TestHookSchedulingStormScaleError runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormScaleError(t *testing.T).
// Why: covers the final error path where Ananke detects a real storm but cannot
// scale the offending workload down.
func TestHookSchedulingStormScaleError(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 5
cfg.Startup.SchedulingStormWindowSeconds = 60
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45}]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}}]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
return "", errors.New("scale denied")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), "scale scheduling storm workload ai/deployment/ollama to 0") {
t.Fatalf("expected scale error, got %v", err)
}
}
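// Hypothetical sketch of the scale step this test forces to fail; fmt and the
// scan-derived ns/name values are assumed:
//
//	if _, err := run(ctx, timeout, "kubectl", "-n", ns, "scale", "deployment", name, "--replicas=0"); err != nil {
//		return fmt.Errorf("scale scheduling storm workload %s/deployment/%s to 0: %w", ns, name, err)
//	}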


@@ -1,222 +0,0 @@
package orchestrator

import (
	"context"
	"errors"
	"os"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
)
// readStartupProgress reads the orchestrator's startup progress artifact.
// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
// Why: startup helper tests need to inspect progress artifacts without reaching
// into internal package state from the top-level testing module.
func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
t.Helper()
payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
if err != nil {
t.Fatalf("read startup progress: %v", err)
}
return string(payload)
}
// TestHookStartupScopeAndVaultHelpers runs one orchestration or CLI step.
// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
// Why: keeps startup-scope and startup-Vault helper branches covered from the
// split top-level testing module required by the repo hygiene contract.
func TestHookStartupScopeAndVaultHelpers(t *testing.T) {
t.Run("startup-scope-helpers", func(t *testing.T) {
nodes := []string{"titan-db", " titan-23 ", "", "titan-24"}
if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) {
t.Fatalf("expected passthrough node list, got %v", got)
}
got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"})
if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" {
t.Fatalf("unexpected filtered node list: %v", got)
}
if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") {
t.Fatalf("expected trimmed node membership match")
}
if cluster.TestHookContainsNode([]string{"titan-db"}, " ") {
t.Fatalf("expected blank node probe to be ignored")
}
cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
if !orch.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected all nodes to be strict when no recovery scopes are configured")
}
cfgScoped := lifecycleConfig(t)
cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"}
cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"}
cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "}
cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "}
orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil)
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") {
t.Fatalf("expected control plane to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected inventory-scoped node to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") {
t.Fatalf("expected ssh-scoped node to remain strict")
}
if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") {
t.Fatalf("expected non-core worker to stop being strict")
}
flux := orchScoped.TestHookStartupRequiredFluxKustomizations()
if _, ok := flux["flux-system/core"]; !ok {
t.Fatalf("expected core flux kustomization in required set: %v", flux)
}
if _, ok := flux["flux-system/gitea"]; !ok {
t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux)
}
namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces()
if _, ok := namespaces["vault"]; !ok {
t.Fatalf("expected vault namespace in required set: %v", namespaces)
}
if _, ok := namespaces["monitoring"]; !ok {
t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces)
}
})
t.Run("startup-vault-helpers", func(t *testing.T) {
t.Run("early-vault-unseal-paths", func(t *testing.T) {
cfgAPI := lifecycleConfig(t)
runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
return "", errors.New("api down")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI)
orchAPI.TestHookBeginStartupReport("startup-vault")
orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") {
t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload)
}
cfgErr := lifecycleConfig(t)
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "", errors.New("phase probe failed")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
orchErr.TestHookBeginStartupReport("startup-vault")
orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") {
t.Fatalf("expected early vault auto-heal detail, payload=%s", payload)
}
cfgDeferred := lifecycleConfig(t)
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Pending", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
orchDeferred.TestHookBeginStartupReport("startup-vault")
orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
t.Fatalf("expected deferred early vault detail, payload=%s", payload)
}
cfgSuccess := lifecycleConfig(t)
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
return `{"sealed":false,"initialized":true}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
orchSuccess.TestHookBeginStartupReport("startup-vault")
orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) {
t.Fatalf("expected successful early vault check, payload=%s", payload)
}
})
t.Run("startup-vault-gate-paths", func(t *testing.T) {
cfgErr := lifecycleConfig(t)
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
return "", errors.New("phase probe failed")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
orchErr.TestHookBeginStartupReport("startup-vault")
if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") {
t.Fatalf("expected startup vault gate error, got %v", err)
}
cfgDeferred := lifecycleConfig(t)
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
return "Pending", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
orchDeferred.TestHookBeginStartupReport("startup-vault")
if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
t.Fatalf("expected deferred startup vault gate to succeed, got %v", err)
}
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
t.Fatalf("expected deferred startup vault detail, payload=%s", payload)
}
cfgSuccess := lifecycleConfig(t)
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
return `{"sealed":false,"initialized":true}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
orchSuccess.TestHookBeginStartupReport("startup-vault")
if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
t.Fatalf("expected successful startup vault gate, got %v", err)
}
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) {
t.Fatalf("expected successful startup vault detail, payload=%s", payload)
}
})
})
}
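// Hypothetical sketch of the gate decision the vault sub-tests walk through;
// runKubectl and reportDetail are illustrative stand-ins, not the shipped API:
//
//	phase, err := runKubectl(ctx, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
//	if err != nil {
//		return err // the startup gate surfaces this; early mode records it as auto-heal detail
//	}
//	if strings.TrimSpace(phase) != "Running" {
//		reportDetail("deferred early vault unseal: vault-0 pod phase is " + phase)
//		return nil
//	}
//	// Running: "vault status -format=json" must then report sealed=false.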


@@ -24,36 +24,12 @@ func TestStateTestHookOverrideSetters(t *testing.T) {
 	}
 	restoreWriteNil()
-	restoreReadNil := state.TestHookSetReadIntentOverride(nil)
-	readAfterNil, err := state.ReadIntent(intentPath)
-	if err != nil || readAfterNil.State != state.IntentNormal {
-		t.Fatalf("expected default read intent path after nil override, got %v / %v", readAfterNil, err)
-	}
-	restoreReadNil()
-	readOverrideCalled := false
-	restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
-		readOverrideCalled = true
-		return state.Intent{}, errors.New("forced read override")
-	})
-	_, err = state.ReadIntent(intentPath)
-	if err == nil || !strings.Contains(err.Error(), "forced read override") {
-		t.Fatalf("expected forced read override error, got %v", err)
-	}
-	if !readOverrideCalled {
-		t.Fatalf("expected read override to be invoked")
-	}
-	restoreRead()
-	if _, err := state.TestHookReadIntentDefault(intentPath); err != nil {
-		t.Fatalf("expected explicit default read helper to succeed, got %v", err)
-	}
 	writeOverrideCalled := false
 	restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
 		writeOverrideCalled = true
 		return errors.New("forced write override")
 	})
-	err = state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
+	err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
 	if err == nil || !strings.Contains(err.Error(), "forced write override") {
 		t.Fatalf("expected forced write override error, got %v", err)
 	}