Compare commits


No commits in common. "main" and "codex/ananke-gate-platform-metrics" have entirely different histories.

64 changed files with 1157 additions and 4539 deletions

2 .gitignore vendored
View File

@@ -1,6 +1,4 @@
/bin/
/build/
/dist/
internal/state/.corrupt-*
*.log
*.tmp

201 Jenkinsfile vendored
View File

@@ -1,59 +1,25 @@
pipeline {
agent {
kubernetes {
label 'ananke-quality'
defaultContainer 'go-tester'
yaml """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-06
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-13
- titan-15
- titan-17
- titan-19
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
jenkins/jenkins-jenkins-agent: "true"
containers:
- name: go-tester
image: registry.bstein.dev/bstein/golang:1.25-bookworm
image: golang:1.25-bookworm
command: ["cat"]
tty: true
volumeMounts:
- name: workspace-volume
mountPath: /home/jenkins/agent
- name: publisher
image: registry.bstein.dev/bstein/python:3.12-slim
command: ["cat"]
tty: true
volumeMounts:
- name: workspace-volume
mountPath: /home/jenkins/agent
- name: quality-tools
image: registry.bstein.dev/bstein/quality-tools:sonar8.0.1-trivy0.70.0-db20260422-arm64
image: python:3.12-slim
command: ["cat"]
tty: true
volumeMounts:
@@ -69,13 +35,7 @@ spec:
environment {
SUITE_NAME = 'ananke'
PUSHGATEWAY_URL = 'http://platform-quality-gateway.monitoring.svc.cluster.local:9091'
SONARQUBE_HOST_URL = 'http://sonarqube.quality.svc.cluster.local:9000'
SONARQUBE_PROJECT_KEY = 'ananke'
SONARQUBE_TOKEN = credentials('sonarqube-token')
QUALITY_GATE_SONARQUBE_ENFORCE = '1'
QUALITY_GATE_SONARQUBE_REPORT = 'build/sonarqube-quality-gate.json'
QUALITY_GATE_IRONBANK_ENFORCE = '1'
QUALITY_GATE_IRONBANK_REQUIRED = '0'
QUALITY_GATE_IRONBANK_REPORT = 'build/ironbank-compliance.json'
}
@@ -97,27 +57,6 @@ spec:
stage('Collect SonarQube evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
args=(
"-Dsonar.host.url=${SONARQUBE_HOST_URL}"
"-Dsonar.login=${SONARQUBE_TOKEN}"
"-Dsonar.projectKey=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.projectName=${SONARQUBE_PROJECT_KEY}"
"-Dsonar.sources=."
"-Dsonar.exclusions=**/.git/**,**/build/**,**/dist/**,**/node_modules/**,**/.venv/**,**/__pycache__/**,**/coverage/**,**/test-results/**,**/playwright-report/**"
"-Dsonar.test.inclusions=**/tests/**,**/testing/**,**/*_test.go,**/*.test.ts,**/*.test.tsx,**/*.spec.ts,**/*.spec.tsx"
)
[ -f build/coverage.out ] && args+=("-Dsonar.go.coverage.reportPaths=build/coverage.out")
set +e
sonar-scanner "${args[@]}" | tee build/sonar-scanner.log
rc=${PIPESTATUS[0]}
set -e
printf '%s\n' "${rc}" > build/sonarqube-analysis.rc
'''
}
container('publisher') {
sh '''
set -eu
@@ -156,34 +95,6 @@ PY
stage('Collect Supply Chain evidence') {
steps {
container('quality-tools') {
sh '''#!/usr/bin/env bash
set -euo pipefail
mkdir -p build
set +e
trivy fs --cache-dir "${TRIVY_CACHE_DIR}" --skip-db-update --timeout 5m --no-progress --format json --output build/trivy-fs.json --scanners vuln,secret,misconfig --severity HIGH,CRITICAL .
trivy_rc=$?
set -e
if [ ! -s build/trivy-fs.json ]; then
cat > build/ironbank-compliance.json <<EOF
{"status":"failed","compliant":false,"scanner":"trivy","scan_type":"filesystem","error":"trivy did not produce JSON output","trivy_rc":${trivy_rc}}
EOF
exit 0
fi
critical="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="CRITICAL")] | length' build/trivy-fs.json)"
high="$(jq '[.Results[]? | .Vulnerabilities[]? | select(.Severity=="HIGH")] | length' build/trivy-fs.json)"
secrets="$(jq '[.Results[]? | .Secrets[]?] | length' build/trivy-fs.json)"
misconfigs="$(jq '[.Results[]? | .Misconfigurations[]? | select(.Status=="FAIL" and (.Severity=="CRITICAL" or .Severity=="HIGH"))] | length' build/trivy-fs.json)"
status=ok
compliant=true
if [ "${critical}" -gt 0 ] || [ "${secrets}" -gt 0 ] || [ "${misconfigs}" -gt 0 ]; then
status=failed
compliant=false
fi
jq -n --arg status "${status}" --argjson compliant "${compliant}" --argjson critical "${critical}" --argjson high "${high}" --argjson secrets "${secrets}" --argjson misconfigs "${misconfigs}" --argjson trivy_rc "${trivy_rc}" \
'{status:$status, compliant:$compliant, category:"artifact_security", scan_type:"filesystem", scanner:"trivy", critical_vulnerabilities:$critical, high_vulnerabilities:$high, secrets:$secrets, high_or_critical_misconfigurations:$misconfigs, trivy_rc:$trivy_rc, high_vulnerability_policy:"observe"}' > build/ironbank-compliance.json
'''
}
container('publisher') {
sh '''
set -eu
@@ -241,25 +152,13 @@ PY
failed_runs="$(awk -F= '$1=="failed"{print $2}' build/quality-gate.state 2>/dev/null | tail -n1)"
[ -n "${ok_runs}" ] || ok_runs=0
[ -n "${failed_runs}" ] || failed_runs=0
coverage_percent="$(python3 - <<'PY'
import re
from pathlib import Path
log_path = Path("build/quality-gate.out")
text = log_path.read_text(encoding="utf-8", errors="ignore") if log_path.exists() else ""
values = [float(match.group(1)) for match in re.finditer(r"([0-9]+(?:\\.[0-9]+)?)%", text)]
print(values[-1] if values else 0.0)
PY
)"
printf '%s\n' "${coverage_percent}" > build/coverage-percent.txt
python3 scripts/publish_quality_metrics.py \
--pushgateway-url "${PUSHGATEWAY_URL}" \
--job-name platform-quality-ci \
--suite "${SUITE_NAME}" \
--trigger jenkins \
--local-ok "${ok_runs}" \
--local-failed "${failed_runs}" \
--coverage-percent-file build/coverage-percent.txt
--local-failed "${failed_runs}"
'''
}
}
@@ -270,95 +169,7 @@ PY
container('publisher') {
sh '''
set -eu
gate_rc="$(cat build/quality-gate.rc 2>/dev/null || echo 1)"
fail=0
if [ "${gate_rc}" -ne 0 ]; then
echo "quality gate failed with rc=${gate_rc}" >&2
fail=1
fi
enabled() {
case "$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" in
1|true|yes|on) return 0 ;;
*) return 1 ;;
esac
}
if enabled "${QUALITY_GATE_SONARQUBE_ENFORCE:-1}"; then
sonar_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/sonarqube-quality-gate.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
status = (payload.get("status") or payload.get("projectStatus", {}).get("status") or payload.get("qualityGate", {}).get("status") or "").strip().lower()
print(status or "missing")
PY
)"
case "${sonar_status}" in
ok|pass|passed|success) ;;
*)
echo "sonarqube gate failed: ${sonar_status}" >&2
fail=1
;;
esac
fi
ironbank_required="${QUALITY_GATE_IRONBANK_REQUIRED:-0}"
if [ "${PUBLISH_IMAGES:-false}" = "true" ]; then
ironbank_required=1
fi
if enabled "${QUALITY_GATE_IRONBANK_ENFORCE:-1}"; then
supply_status="$(python3 - <<'PY'
import json
from pathlib import Path
path = Path("build/ironbank-compliance.json")
if not path.exists():
print("missing")
raise SystemExit(0)
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception: # noqa: BLE001
print("error")
raise SystemExit(0)
compliant = payload.get("compliant")
if compliant is True:
print("ok")
elif compliant is False:
print("failed")
else:
status = str(payload.get("status") or payload.get("result") or payload.get("compliance") or "").strip().lower()
print(status or "missing")
PY
)"
case "${supply_status}" in
ok|pass|passed|success|compliant) ;;
not_applicable|na|n/a)
if enabled "${ironbank_required}"; then
echo "supply chain gate required but status=${supply_status}" >&2
fail=1
fi
;;
*)
if enabled "${ironbank_required}"; then
echo "supply chain gate failed: ${supply_status}" >&2
fail=1
else
echo "supply chain gate not passing (${supply_status}) but not required for this run" >&2
fi
;;
esac
fi
exit "${fail}"
test "$(cat build/quality-gate.rc 2>/dev/null || echo 1)" -eq 0
'''
}
}
@@ -367,7 +178,7 @@ PY
post {
always {
archiveArtifacts artifacts: 'build/*.json,build/*.out,build/*.rc,build/*.txt,build/*.xml', allowEmptyArchive: true, fingerprint: true
archiveArtifacts artifacts: 'build/quality-gate.out,build/quality-gate.rc', allowEmptyArchive: true, fingerprint: true
}
}
}
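
For context on the Jenkinsfile changes above: the simplified gate stage now only asserts that build/quality-gate.rc is zero, while the removed logic also read build/sonarqube-quality-gate.json and build/ironbank-compliance.json before deciding to fail the build. The sketch below restates that removed decision in Go. It is illustrative only and not code from this repository; the file paths and JSON fields (status, projectStatus.status, compliant) come from the removed shell/Python above, and everything else (package layout, error handling style) is assumed.

// gatecheck.go: minimal sketch of the combined gate decision the removed
// "Enforce quality gate" logic made in shell + Python. Illustrative only.
package main

import (
	"encoding/json"
	"fmt"
	"os"
	"strings"
)

// sonarStatus reads the SonarQube quality-gate report and returns its status,
// or "missing"/"error" when the file is absent or unparseable.
func sonarStatus(path string) string {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "missing"
	}
	var payload struct {
		Status        string `json:"status"`
		ProjectStatus struct {
			Status string `json:"status"`
		} `json:"projectStatus"`
	}
	if err := json.Unmarshal(raw, &payload); err != nil {
		return "error"
	}
	s := payload.Status
	if s == "" {
		s = payload.ProjectStatus.Status
	}
	if s = strings.ToLower(strings.TrimSpace(s)); s == "" {
		return "missing"
	}
	return s
}

// ironbankCompliant reports whether the Trivy-derived compliance report
// marked the scanned tree as compliant.
func ironbankCompliant(path string) bool {
	raw, err := os.ReadFile(path)
	if err != nil {
		return false
	}
	var payload struct {
		Compliant bool `json:"compliant"`
	}
	return json.Unmarshal(raw, &payload) == nil && payload.Compliant
}

func main() {
	fail := false
	switch sonarStatus("build/sonarqube-quality-gate.json") {
	case "ok", "pass", "passed", "success":
		// SonarQube gate passed.
	default:
		fmt.Fprintln(os.Stderr, "sonarqube gate not passing")
		fail = true
	}
	if !ironbankCompliant("build/ironbank-compliance.json") {
		fmt.Fprintln(os.Stderr, "supply chain gate not passing")
		fail = true
	}
	if fail {
		os.Exit(1)
	}
}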

View File

@@ -97,15 +97,10 @@ Primary config path:
Keep these fields accurate:
- `expected_flux_source_url`
- `expected_flux_branch`
- `startup.service_checklist_explicit_only`
- `startup.service_checklist`
- `startup.critical_service_endpoints`
- `startup.require_ingress_checklist`
- `startup.require_node_inventory_reachability`
- `startup.node_inventory_reachability_required_nodes`
- `startup.node_ssh_auth_required_nodes`
- `startup.flux_health_required_kustomizations`
- `startup.workload_convergence_required_namespaces`
- `startup.ignore_unavailable_nodes`
- `coordination.role`
- `coordination.peer_hosts`
@@ -139,10 +134,9 @@ Installer behavior:
When adding nodes or services:
1. Update inventory and node mapping in config.
2. Keep the explicit service checklist focused on the core services that must come back during an outage.
3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap.
4. Add/adjust ingress expectations for exposed services.
5. Use temporary ignores only when truly intentional, then remove them.
6. Run `scripts/quality_gate.sh` before host deployment.
2. Add/adjust service checklist entries for anything user-facing or critical.
3. Add/adjust ingress expectations for exposed services.
4. Use temporary ignores only when truly intentional, then remove them.
5. Run `scripts/quality_gate.sh` before host deployment.
Recovery quality should improve over time: every drill should reduce manual work in the next drill.
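
Step 2 above boils down to registering a named health URL, like the gitea-api entries visible in the config diffs below. The following Go sketch shows what probing one checklist entry amounts to; it is illustrative only, and the checklistEntry type, probe function, and timeout are hypothetical rather than ananke's actual implementation.

// checklistprobe.go: illustrative sketch of a service-checklist health probe.
// Type and function names are hypothetical, not ananke's real code.
package main

import (
	"fmt"
	"net/http"
	"time"
)

type checklistEntry struct {
	Name string // e.g. "gitea-api"
	URL  string // e.g. "https://scm.bstein.dev/api/healthz"
}

// probe returns nil when the endpoint answers with a 2xx status within the timeout.
func probe(e checklistEntry, timeout time.Duration) error {
	client := &http.Client{Timeout: timeout}
	resp, err := client.Get(e.URL)
	if err != nil {
		return fmt.Errorf("%s: %w", e.Name, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("%s: unexpected status %d", e.Name, resp.StatusCode)
	}
	return nil
}

func main() {
	err := probe(checklistEntry{Name: "gitea-api", URL: "https://scm.bstein.dev/api/healthz"}, 10*time.Second)
	fmt.Println("gitea-api healthy:", err == nil)
}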

View File

@@ -51,7 +51,6 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes: []
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
@@ -91,7 +90,6 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: false
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@@ -136,26 +134,18 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes: []
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations: []
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces: []
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
auto_quarantine_scheduling_storms: false
scheduling_storm_event_threshold: 30
scheduling_storm_window_seconds: 180
stuck_pod_grace_seconds: 180
post_start_auto_heal_seconds: 60
dead_node_cleanup_grace_seconds: 300
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: ""
vault_unseal_breakglass_timeout_seconds: 15
@@ -180,7 +170,6 @@ ups:
target: pyrphoros@localhost
poll_seconds: 5
runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:

View File

@@ -117,52 +117,8 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes:
- titan-0a
- titan-0b
- titan-0c
required_node_labels:
titan-04:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-05:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-06:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-07:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-08:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-11:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-12:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-13:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-14:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-15:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-17:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-18:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-19:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-09:
node-role.kubernetes.io/worker: "true"
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true
time_sync_wait_seconds: 240
@@ -200,7 +156,6 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@@ -245,49 +200,18 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes:
- titan-0a
- titan-0b
- titan-0c
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations:
- flux-system/core
- flux-system/helm
- flux-system/traefik
- flux-system/cert-manager
- flux-system/longhorn
- flux-system/vault-csi
- flux-system/vault-injector
- flux-system/postgres
- flux-system/vault
- flux-system/keycloak
- flux-system/oauth2-proxy
- flux-system/gitea
- flux-system/monitoring
- flux-system/harbor
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces:
- vault
- postgres
- sso
- gitea
- monitoring
- harbor
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
auto_quarantine_scheduling_storms: true
scheduling_storm_event_threshold: 30
scheduling_storm_window_seconds: 180
stuck_pod_grace_seconds: 180
post_start_auto_heal_seconds: 60
dead_node_cleanup_grace_seconds: 300
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
@@ -311,7 +235,6 @@ ups:
target: statera@localhost
poll_seconds: 5
runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:

View File

@@ -117,52 +117,8 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes:
- titan-0a
- titan-0b
- titan-0c
required_node_labels:
titan-04:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-05:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-06:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-07:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-08:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-11:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-12:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-13:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-14:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-15:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-17:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-18:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-19:
node-role.kubernetes.io/worker: "true"
longhorn-host: "true"
titan-09:
node-role.kubernetes.io/worker: "true"
ananke.bstein.dev/harbor-bootstrap: "true"
require_time_sync: true
time_sync_wait_seconds: 240
@@ -200,7 +156,6 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@@ -245,49 +200,18 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes:
- titan-0a
- titan-0b
- titan-0c
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations:
- flux-system/core
- flux-system/helm
- flux-system/traefik
- flux-system/cert-manager
- flux-system/longhorn
- flux-system/vault-csi
- flux-system/vault-injector
- flux-system/postgres
- flux-system/vault
- flux-system/keycloak
- flux-system/oauth2-proxy
- flux-system/gitea
- flux-system/monitoring
- flux-system/harbor
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces:
- vault
- postgres
- sso
- gitea
- monitoring
- harbor
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []
auto_recycle_stuck_pods: true
auto_quarantine_scheduling_storms: true
scheduling_storm_event_threshold: 30
scheduling_storm_window_seconds: 180
stuck_pod_grace_seconds: 180
post_start_auto_heal_seconds: 60
dead_node_cleanup_grace_seconds: 300
vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
vault_unseal_breakglass_timeout_seconds: 15
@@ -311,7 +235,6 @@ ups:
target: pyrphoros@localhost
poll_seconds: 5
runtime_safety_factor: 1.25
on_battery_grace_seconds: 90
debounce_count: 3
telemetry_timeout_seconds: 90
coordination:

View File

@@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
seen := map[string]struct{}{}
targets := make([]string, 0, len(nodes))
for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) {
for _, node := range nodes {
node = strings.TrimSpace(node)
if node == "" {
continue

View File

@@ -1,288 +0,0 @@
package cluster
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
)
type nodeReadyList struct {
Items []struct {
Metadata struct {
Name string `json:"name"`
} `json:"metadata"`
Status struct {
Conditions []struct {
Type string `json:"type"`
Status string `json:"status"`
} `json:"conditions"`
} `json:"status"`
} `json:"items"`
}
type podDeleteList struct {
Items []struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
DeletionTimestamp *time.Time `json:"deletionTimestamp"`
} `json:"metadata"`
Spec struct {
NodeName string `json:"nodeName"`
} `json:"spec"`
} `json:"items"`
}
// RunPostStartAutoHeal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
// post-start drift without rerunning the full startup flow.
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
return o.postStartAutoHeal(ctx)
}
// postStartAutoHeal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
// Why: centralizes bounded post-start repair actions so recurring outage
// patterns only trigger the specific remediation they need.
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
errs := []string{}
requestReconcile := false
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
errs = append(errs, fmt.Sprintf("required node labels: %v", err))
}
vaultRecovered, err := o.autoRecoverSealedVault(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
} else if vaultRecovered {
requestReconcile = true
if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
}
}
cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
} else if cleaned > 0 {
requestReconcile = true
}
if requestReconcile {
o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
return o.requestFluxReconcile(ctx)
})
}
if len(errs) > 0 {
return errors.New(strings.Join(errs, "; "))
}
return nil
}
// autoRecoverSealedVault runs one orchestration or CLI step.
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
// Why: lets the daemon repair a later Vault reseal without waiting for a new
// bootstrap run.
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
if o.runner.DryRun {
return false, nil
}
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
if err != nil {
if isNotFoundErr(err) {
return false, nil
}
return false, fmt.Errorf("vault pod phase check failed: %w", err)
}
if strings.TrimSpace(phase) != "Running" {
return false, nil
}
sealed, err := o.vaultSealed(ctx)
if err != nil {
return false, err
}
if !sealed {
return false, nil
}
o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
if err := o.ensureVaultUnsealed(ctx); err != nil {
return false, err
}
return true, nil
}
// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
// downstream secret consumers stop carrying stale failures from the sealed window.
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "vault",
"create", "job",
"--from=cronjob/vault-k8s-auth-config",
jobName,
); err != nil {
return fmt.Errorf("create job %s: %w", jobName, err)
}
o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
return nil
}
// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
// clear only that narrow failure class instead of leaving garbage behind forever.
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
if o.runner.DryRun {
return 0, nil
}
unavailable, err := o.unavailableNodeSet(ctx)
if err != nil {
return 0, err
}
if len(unavailable) == 0 {
return 0, nil
}
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
return 0, fmt.Errorf("query pods: %w", err)
}
var pods podDeleteList
if err := json.Unmarshal([]byte(out), &pods); err != nil {
return 0, fmt.Errorf("decode pods: %w", err)
}
grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
now := time.Now()
count := 0
for _, item := range pods.Items {
if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
continue
}
if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
continue
}
if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
continue
}
o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
if _, err := o.kubectl(
ctx,
20*time.Second,
"-n", item.Metadata.Namespace,
"delete", "pod", item.Metadata.Name,
"--grace-period=0",
"--force",
"--wait=false",
); err != nil && !isNotFoundErr(err) {
return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
}
count++
}
if count > 0 {
o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
}
return count, nil
}
// unavailableNodeSet runs one orchestration or CLI step.
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
if err != nil {
return nil, fmt.Errorf("query nodes: %w", err)
}
var nodes nodeReadyList
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
return nil, fmt.Errorf("decode nodes: %w", err)
}
unavailable := map[string]struct{}{}
for _, item := range nodes.Items {
ready := ""
for _, cond := range item.Status.Conditions {
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
ready = strings.TrimSpace(cond.Status)
break
}
}
if ready != "True" {
unavailable[item.Metadata.Name] = struct{}{}
}
}
return unavailable, nil
}
// requestFluxReconcile runs one orchestration or CLI step.
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
// Why: post-start repairs need a lightweight way to refresh GitOps health
// without reusing the broader startup flux-resume flow.
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
now := time.Now().UTC().Format(time.RFC3339)
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate", "gitrepository", "flux-system",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux source reconcile: %w", err)
}
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate",
"kustomizations.kustomize.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
}
if _, err := o.kubectl(
ctx,
25*time.Second,
"annotate",
"--all-namespaces",
"helmreleases.helm.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
}
if o.runOverride == nil && o.runner.CommandExists("flux") {
if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
}
}
return nil
}

View File

@@ -1,296 +0,0 @@
package cluster
import (
"context"
"errors"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestCleanupTerminatingPodsOnUnavailableNodesBranches runs one orchestration or CLI step.
// Signature: TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T).
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
// truly stranded pods and tolerates already-gone objects.
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if err != nil || count != 0 {
t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
}
})
t.Run("selective cleanup tolerates not found", func(t *testing.T) {
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{"items":[` +
`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
},
{
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
err: errors.New("pod old-stale not found"),
},
})
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if err != nil {
t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
}
if count != 1 {
t.Fatalf("expected one cleaned pod, got %d", count)
}
})
t.Run("query and decode errors surface", func(t *testing.T) {
queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
err: errors.New("nodes failed"),
},
})
if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
t.Fatalf("expected node query error, got %v", err)
}
decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{bad json`,
},
})
if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
t.Fatalf("expected pod decode error, got %v", err)
}
})
t.Run("delete hard error surfaces", func(t *testing.T) {
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
},
{
match: matchContains("kubectl", "get pods -A -o json"),
out: `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
},
{
match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
err: errors.New("delete failed"),
},
})
count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
}
})
}
// TestUnavailableNodeSetBranches runs one orchestration or CLI step.
// Signature: TestUnavailableNodeSetBranches(t *testing.T).
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
// Ready condition payloads need direct coverage too.
func TestUnavailableNodeSetBranches(t *testing.T) {
t.Run("decode error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
})
if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
t.Fatalf("expected decode error, got %v", err)
}
})
t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "get nodes -o json"),
out: `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
},
})
nodes, err := orch.unavailableNodeSet(context.Background())
if err != nil {
t.Fatalf("unavailableNodeSet failed: %v", err)
}
if _, ok := nodes["titan-22"]; !ok {
t.Fatalf("expected titan-22 to be treated as unavailable")
}
if _, ok := nodes["titan-07"]; ok {
t.Fatalf("did not expect titan-07 to be treated as unavailable")
}
})
}
// TestRequestFluxReconcileBranches runs one orchestration or CLI step.
// Signature: TestRequestFluxReconcileBranches(t *testing.T).
// Why: the post-start repair loop needs predictable Flux refresh behavior even
// when one annotation call is flaky.
func TestRequestFluxReconcileBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
}
})
t.Run("git source annotate error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
err: errors.New("annotate failed"),
},
})
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
t.Fatalf("expected gitrepository annotate error, got %v", err)
}
})
t.Run("kustomization annotate error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
out: "",
},
{
match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
err: errors.New("annotate failed"),
},
})
if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
t.Fatalf("expected kustomization annotate error, got %v", err)
}
})
t.Run("helm annotate warning and flux command path", func(t *testing.T) {
tmpDir := t.TempDir()
callLog := filepath.Join(tmpDir, "calls.log")
kubectlPath := filepath.Join(tmpDir, "kubectl")
fluxPath := filepath.Join(tmpDir, "flux")
kubectlScript := "#!/bin/sh\n" +
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"case \"$*\" in\n" +
" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
"esac\n" +
"exit 0\n"
fluxScript := "#!/bin/sh\n" +
"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 0\n"
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
t.Fatalf("write fake kubectl: %v", err)
}
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
t.Fatalf("write fake flux: %v", err)
}
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
cfg := config.Config{
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(cfg.State.RunHistoryPath),
log: log.New(io.Discard, "", 0),
}
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
}
calls, err := os.ReadFile(callLog)
if err != nil {
t.Fatalf("read fake command log: %v", err)
}
logText := string(calls)
if !strings.Contains(logText, "annotate gitrepository flux-system") {
t.Fatalf("expected gitrepository annotate call, got %q", logText)
}
if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
t.Fatalf("expected kustomization annotate call, got %q", logText)
}
if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
t.Fatalf("expected flux reconcile command, got %q", logText)
}
})
t.Run("flux command failure is tolerated", func(t *testing.T) {
tmpDir := t.TempDir()
callLog := filepath.Join(tmpDir, "calls.log")
kubectlPath := filepath.Join(tmpDir, "kubectl")
fluxPath := filepath.Join(tmpDir, "flux")
kubectlScript := "#!/bin/sh\n" +
"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 0\n"
fluxScript := "#!/bin/sh\n" +
"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
"exit 1\n"
if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
t.Fatalf("write fake kubectl: %v", err)
}
if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
t.Fatalf("write fake flux: %v", err)
}
t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
cfg := config.Config{
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(cfg.State.RunHistoryPath),
log: log.New(io.Discard, "", 0),
}
if err := orch.requestFluxReconcile(context.Background()); err != nil {
t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
}
calls, err := os.ReadFile(callLog)
if err != nil {
t.Fatalf("read fake command log: %v", err)
}
if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
}
})
}

View File

@@ -1,382 +0,0 @@
package cluster
import (
"context"
"encoding/base64"
"errors"
"io"
"log"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestPostStartAutoHealRepairsVaultAndUnavailableNodes runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
// Why: covers the new daemon-triggered repair path for late Vault reseals and
// stale terminating pods anchored to unavailable nodes.
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
RequiredNodeLabels: map[string]map[string]string{
"titan-07": {"node-role.kubernetes.io/worker": "true"},
},
},
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
log: log.New(io.Discard, "", 0),
}
oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
unsealCalls := 0
jobCreated := false
reconciled := false
deleted := map[string]bool{}
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name != "kubectl" {
return "", nil
}
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
return "", nil
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
if unsealCalls == 0 {
return `{"initialized":true,"sealed":true}`, nil
}
return `{"initialized":true,"sealed":false}`, nil
case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
case strings.Contains(joined, "vault operator unseal"):
unsealCalls++
return "", nil
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
jobCreated = true
return "", nil
case strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case strings.Contains(joined, "get pods -A -o json"):
return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
deleted["maintenance/stale-pod"] = true
return "", nil
case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
reconciled = true
return "", nil
case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
return "", nil
case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
if err := orch.postStartAutoHeal(context.Background()); err != nil {
t.Fatalf("postStartAutoHeal failed: %v", err)
}
if unsealCalls != 1 {
t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
}
if !jobCreated {
t.Fatalf("expected vault k8s auth config job to be created")
}
if !deleted["maintenance/stale-pod"] {
t.Fatalf("expected stale unavailable-node pod to be deleted")
}
if !reconciled {
t.Fatalf("expected flux reconcile request after repairs")
}
if deleted["logging/healthy-node-pod"] {
t.Fatalf("did not expect terminating pod on healthy node to be deleted")
}
}
// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
// Why: proves the new post-start repair loop stays quiet when the specific
// failure patterns are absent.
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
},
State: config.State{
Dir: t.TempDir(),
ReportsDir: filepath.Join(t.TempDir(), "reports"),
RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
},
}
orch := &Orchestrator{
cfg: cfg,
runner: &execx.Runner{},
store: state.New(filepath.Join(t.TempDir(), "runs.json")),
log: log.New(io.Discard, "", 0),
}
unsealCalls := 0
jobCreated := false
reconciled := false
dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name != "kubectl" {
return "", nil
}
joined := strings.Join(args, " ")
switch {
case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
return `{"initialized":true,"sealed":false}`, nil
case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
jobCreated = true
return "", nil
case strings.Contains(joined, "vault operator unseal"):
unsealCalls++
return "", nil
case strings.Contains(joined, "get nodes -o json"):
return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
case strings.Contains(joined, "get pods -A -o json"):
return `{"items":[]}`, nil
case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
reconciled = true
return "", nil
default:
return "", nil
}
}
orch.SetCommandOverrides(dispatch, dispatch)
if err := orch.postStartAutoHeal(context.Background()); err != nil {
t.Fatalf("postStartAutoHeal failed: %v", err)
}
if unsealCalls != 0 {
t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
}
if jobCreated {
t.Fatalf("did not expect vault auth config job creation")
}
if reconciled {
t.Fatalf("did not expect flux reconcile request for healthy cluster")
}
}
// TestRunPostStartAutoHealDryRun runs one orchestration or CLI step.
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
// auto-heal never mutates cluster state during rehearsal runs.
func TestRunPostStartAutoHealDryRun(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
}
}
// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
// Why: proves the daemon reports each failed sub-repair together instead of
// hiding later failures behind the first problem.
func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
cfg := config.Config{
Startup: config.Startup{
DeadNodeCleanupGraceSeconds: 300,
RequiredNodeLabels: map[string]map[string]string{
"titan-07": {"node-role.kubernetes.io/worker": "true"},
},
},
}
orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
{
match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
err: errors.New("label failed"),
},
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault phase failed"),
},
{
match: matchContains("kubectl", "get nodes -o json"),
err: errors.New("node query failed"),
},
})
err := orch.postStartAutoHeal(context.Background())
if err == nil {
t.Fatalf("expected aggregated error")
}
msg := err.Error()
for _, want := range []string{
"required node labels:",
"vault auto-recovery:",
"dead-node terminating pod cleanup:",
} {
if !strings.Contains(msg, want) {
t.Fatalf("expected %q in %q", want, msg)
}
}
}
// TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step.
// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
// Why: late Vault reseals are a high-risk failure path, so the daemon needs
// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("pod missing is quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("vault-0 not found"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("phase check error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
err: errors.New("phase check failed"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("non-running pod defers", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Pending",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("status parse failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: "garbage",
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
}
})
t.Run("already unsealed stays quiet", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":false}`,
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if err != nil || recovered {
t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
}
})
t.Run("unseal failure surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
out: "Running",
},
{
match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
out: `{"sealed":true}`,
},
{
match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
out: base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
},
{
match: matchContains("kubectl", "vault operator unseal"),
err: errors.New("exec boom"),
},
})
recovered, err := orch.autoRecoverSealedVault(context.Background())
if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
}
})
}
// TestRerunVaultK8sAuthConfigJobBranches runs one orchestration or CLI step.
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
// Why: the post-unseal auth job is part of the production recovery chain, so
// dry-run and create-error behavior both need explicit coverage.
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
t.Run("dry run skips", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
orch.runner.DryRun = true
if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
}
})
t.Run("create error surfaces", func(t *testing.T) {
orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
{
match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
err: errors.New("create failed"),
},
})
err := orch.rerunVaultK8sAuthConfigJob(context.Background())
if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
t.Fatalf("expected create-job error, got %v", err)
}
})
}

View File

@@ -227,31 +227,6 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
}
// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error).
// Why: lets startup defer vault unseal until the pod is actually runnable, while
// keeping the direct unseal helper strict for explicit recovery paths and tests.
func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) {
if o.runner.DryRun {
return false, "", nil
}
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
if err != nil {
if isNotFoundErr(err) {
return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil
}
return false, "", fmt.Errorf("vault pod phase check failed: %w", err)
}
trimmedPhase := strings.TrimSpace(phase)
if trimmedPhase != "Running" {
return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil
}
return false, "", o.ensureVaultUnsealed(ctx)
}
// ensureVaultUnsealed runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@@ -143,8 +143,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
}
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
required := o.startupRequiredFluxKustomizations()
requiredSeen := map[string]struct{}{}
notReady := []string{}
for _, ks := range list.Items {
ns := strings.TrimSpace(ks.Metadata.Namespace)
@@ -156,12 +154,6 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
if ks.Spec.Suspend {
continue
}
if len(required) > 0 {
if _, ok := required[full]; !ok {
continue
}
requiredSeen[full] = struct{}{}
}
if _, ok := ignored[full]; ok {
continue
}
@@ -181,25 +173,10 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
}
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
}
if len(required) > 0 {
missing := []string{}
for full := range required {
if _, ok := requiredSeen[full]; !ok {
missing = append(missing, full+"(missing)")
}
}
if len(missing) > 0 {
sort.Strings(missing)
notReady = append(notReady, missing...)
}
}
if len(notReady) > 0 {
sort.Strings(notReady)
return false, "not ready: " + joinLimited(notReady, 6), nil
}
if len(required) > 0 {
return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil
}
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
}

View File

@@ -19,7 +19,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
return nil
}
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
for node := range o.cfg.Startup.RequiredNodeLabels {
node = strings.TrimSpace(node)
@@ -29,10 +28,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
}
sort.Strings(nodes)
for _, node := range nodes {
if _, skip := ignored[node]; skip {
o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
continue
}
labels := o.cfg.Startup.RequiredNodeLabels[node]
if len(labels) == 0 {
continue
@@ -60,11 +55,6 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
continue
}
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
continue
}
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
}
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))

View File

@@ -37,7 +37,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
return invErr
}
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
o.maybeRunEarlyVaultUnseal(ctx)
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
@@ -180,9 +179,6 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
}
}
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
if err := o.runStartupVaultUnsealGate(ctx); err != nil {
return err
}
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
return err
}
@@ -480,3 +476,18 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
o.log.Printf("shutdown flow complete")
return nil
}
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
switch strings.TrimSpace(raw) {
case "", "config", "cluster-only":
return "cluster-only", nil
case "poweroff":
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
default:
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
}
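
Aside (illustrative sketch, not part of the diff): a quick usage check for the mode normalization above. The behavior follows directly from the switch shown; only the sample inputs are invented.

package main

import (
	"fmt"
	"strings"
)

// normalizeShutdownMode is copied from the snippet above so the example runs standalone.
func normalizeShutdownMode(raw string) (string, error) {
	switch strings.TrimSpace(raw) {
	case "", "config", "cluster-only":
		return "cluster-only", nil
	case "poweroff":
		return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
	default:
		return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
	}
}

func main() {
	for _, raw := range []string{"", "config", "cluster-only", "poweroff", "yolo"} {
		mode, err := normalizeShutdownMode(raw)
		fmt.Printf("%-14q -> mode=%q err=%v\n", raw, mode, err)
	}
	// "", "config", and "cluster-only" all normalize to "cluster-only";
	// "poweroff" and unknown values return errors.
}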

View File

@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
seen := map[string]struct{}{}
for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) {
for _, node := range o.inventoryNodesForValidation() {
node = strings.TrimSpace(node)
if node == "" {
continue

View File

@ -1,261 +0,0 @@
package cluster
import (
"context"
"encoding/json"
"fmt"
"sort"
"strings"
"time"
)
// maybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
// Signature: (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: a non-core workload that cannot schedule can emit enough warning events to
// thrash the control plane datastore; quarantine keeps startup moving while
// preserving core services.
func (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
if o.runner.DryRun || !o.cfg.Startup.AutoQuarantineSchedulingStorms {
return
}
now := time.Now()
if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
return
}
if lastAttempt != nil {
*lastAttempt = now
}
o.bestEffort("quarantine non-core scheduling storm workloads", func() error {
return o.quarantineSchedulingStormWorkloads(ctx)
})
}
// quarantineSchedulingStormWorkloads runs one orchestration or CLI step.
// Signature: (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: limits startup-only mitigation to workloads proven to be generating a
// scheduling event storm, instead of scaling optional apps down blindly.
func (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error {
podsOut, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query pods for scheduling storm scan: %w", err)
}
var pods podList
if err := json.Unmarshal([]byte(podsOut), &pods); err != nil {
return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
}
rsOut, err := o.kubectl(ctx, 30*time.Second, "get", "replicasets", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query replicasets for scheduling storm scan: %w", err)
}
var rsList replicaSetList
if err := json.Unmarshal([]byte(rsOut), &rsList); err != nil {
return fmt.Errorf("decode replicasets for scheduling storm scan: %w", err)
}
eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query events for scheduling storm scan: %w", err)
}
var events eventList
if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
return fmt.Errorf("decode events for scheduling storm scan: %w", err)
}
workloadsOut, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
if err != nil {
return fmt.Errorf("query workloads for scheduling storm scan: %w", err)
}
var workloads workloadList
if err := json.Unmarshal([]byte(workloadsOut), &workloads); err != nil {
return fmt.Errorf("decode workloads for scheduling storm scan: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
eventThreshold := o.cfg.Startup.SchedulingStormEventThreshold
if eventThreshold <= 0 {
eventThreshold = 30
}
window := time.Duration(o.cfg.Startup.SchedulingStormWindowSeconds) * time.Second
if window <= 0 {
window = 3 * time.Minute
}
podsByKey := map[string]podResource{}
for _, pod := range pods.Items {
ns := strings.TrimSpace(pod.Metadata.Namespace)
name := strings.TrimSpace(pod.Metadata.Name)
if ns == "" || name == "" {
continue
}
podsByKey[ns+"/"+name] = pod
}
rsOwners := map[string]ownerReference{}
for _, rs := range rsList.Items {
ns := strings.TrimSpace(rs.Metadata.Namespace)
name := strings.TrimSpace(rs.Metadata.Name)
if ns == "" || name == "" {
continue
}
for _, owner := range rs.Metadata.OwnerReferences {
kind := strings.TrimSpace(owner.Kind)
ownerName := strings.TrimSpace(owner.Name)
if kind == "" || ownerName == "" {
continue
}
rsOwners[ns+"/"+name] = owner
break
}
}
workloadDesired := map[string]int32{}
for _, item := range workloads.Items {
kind := strings.ToLower(strings.TrimSpace(item.Kind))
ns := strings.TrimSpace(item.Metadata.Namespace)
name := strings.TrimSpace(item.Metadata.Name)
if kind == "" || ns == "" || name == "" {
continue
}
desired, _, ok := desiredReady(item)
if !ok {
continue
}
workloadDesired[ns+"/"+kind+"/"+name] = desired
}
quarantined := []string{}
seen := map[string]struct{}{}
now := time.Now()
for _, event := range events.Items {
if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
continue
}
if strings.TrimSpace(event.Reason) != "FailedScheduling" {
continue
}
if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
continue
}
lastSeen := eventLastObservedAt(event)
if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
continue
}
count := eventObservationCount(event)
if count < eventThreshold {
continue
}
podKey := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
pod, ok := podsByKey[podKey]
if !ok {
continue
}
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
continue
}
ns := strings.TrimSpace(pod.Metadata.Namespace)
if _, ok := requiredNamespaces[ns]; ok {
continue
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}
if workloadIgnored(ignoreRules, ns, "", strings.TrimSpace(pod.Metadata.Name)) {
continue
}
if podTargetsIgnoredNode(pod, ignoredNodes) {
continue
}
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
if !ok {
continue
}
if workloadIgnored(ignoreRules, workload.Namespace, workload.Kind, workload.Name) {
continue
}
workloadKey := workload.Namespace + "/" + workload.Kind + "/" + workload.Name
if _, done := seen[workloadKey]; done {
continue
}
desired := workloadDesired[workloadKey]
if desired <= 0 {
continue
}
if err := o.ensureWorkloadReplicas(ctx, workload, 0); err != nil {
return fmt.Errorf("scale scheduling storm workload %s to 0: %w", workloadKey, err)
}
seen[workloadKey] = struct{}{}
quarantined = append(quarantined, fmt.Sprintf("%s events=%d window=%s", workloadKey, count, window))
}
if len(quarantined) == 0 {
return nil
}
sort.Strings(quarantined)
detail := fmt.Sprintf("quarantined scheduling storm workload(s): %s", joinLimited(quarantined, 8))
o.log.Printf("%s", detail)
o.noteStartupAutoHeal(detail)
return nil
}
// schedulingStormOwnerWorkload runs one orchestration or CLI step.
// Signature: schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool).
// Why: scheduling storms happen at the pod layer, but safe mitigation needs to
// operate on the owning deployment or statefulset.
func schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool) {
ns := strings.TrimSpace(pod.Metadata.Namespace)
for _, owner := range pod.Metadata.OwnerReferences {
switch strings.TrimSpace(owner.Kind) {
case "StatefulSet":
if name := strings.TrimSpace(owner.Name); name != "" {
return startupWorkload{Namespace: ns, Kind: "statefulset", Name: name}, true
}
case "ReplicaSet":
rsName := strings.TrimSpace(owner.Name)
if rsName == "" {
continue
}
rsOwner, ok := rsOwners[ns+"/"+rsName]
if !ok || strings.TrimSpace(rsOwner.Kind) != "Deployment" || strings.TrimSpace(rsOwner.Name) == "" {
continue
}
return startupWorkload{Namespace: ns, Kind: "deployment", Name: strings.TrimSpace(rsOwner.Name)}, true
}
}
return startupWorkload{}, false
}
// eventObservationCount runs one orchestration or CLI step.
// Signature: eventObservationCount(event eventResource) int.
// Why: event count can live either on the root event or in the series payload;
// using the max keeps detection stable across Kubernetes versions.
func eventObservationCount(event eventResource) int {
count := event.Count
if event.Series.Count > count {
count = event.Series.Count
}
if count < 1 {
return 1
}
return count
}
// eventLastObservedAt runs one orchestration or CLI step.
// Signature: eventLastObservedAt(event eventResource) time.Time.
// Why: event recency fields vary by cluster version; prefer the newest explicit
// observation time and fall back to creation time when needed.
func eventLastObservedAt(event eventResource) time.Time {
switch {
case !event.Series.LastObservedTime.IsZero():
return event.Series.LastObservedTime
case !event.LastTimestamp.IsZero():
return event.LastTimestamp
case !event.EventTime.IsZero():
return event.EventTime
default:
return event.Metadata.CreationTimestamp
}
}
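
Aside (illustrative sketch, not part of the diff): the two event helpers above normalize fields that differ across Kubernetes event versions. A compact sketch of the same count and recency fallbacks using simplified local types rather than the orchestrator's real structs.

package main

import (
	"fmt"
	"time"
)

type series struct {
	Count            int
	LastObservedTime time.Time
}

type event struct {
	Count         int
	EventTime     time.Time
	LastTimestamp time.Time
	Created       time.Time
	Series        series
}

// observationCount prefers whichever of event.count / series.count is larger
// and never reports fewer than one observation.
func observationCount(e event) int {
	n := e.Count
	if e.Series.Count > n {
		n = e.Series.Count
	}
	if n < 1 {
		return 1
	}
	return n
}

// lastObservedAt walks the recency fields from most to least specific.
func lastObservedAt(e event) time.Time {
	switch {
	case !e.Series.LastObservedTime.IsZero():
		return e.Series.LastObservedTime
	case !e.LastTimestamp.IsZero():
		return e.LastTimestamp
	case !e.EventTime.IsZero():
		return e.EventTime
	default:
		return e.Created
	}
}

func main() {
	created := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)
	e := event{Count: 3, Series: series{Count: 40}, Created: created}
	fmt.Println(observationCount(e)) // 40 — the series count wins when it is larger
	fmt.Println(lastObservedAt(e))   // falls back to the creation timestamp
}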

View File

@ -1,21 +0,0 @@
package cluster
import (
"fmt"
"strings"
)
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
switch strings.TrimSpace(raw) {
case "", "config", "cluster-only":
return "cluster-only", nil
case "poweroff":
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
default:
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
}

View File

@ -1,81 +0,0 @@
package cluster
import "strings"
// startupRequiredNodes runs one orchestration or CLI step.
// Signature: startupRequiredNodes(nodes []string, required []string) []string.
// Why: lets startup enforce a smaller core node set during outage recovery
// without losing the stricter all-nodes behavior when no override is configured.
func startupRequiredNodes(nodes []string, required []string) []string {
requiredSet := makeStringSet(required)
if len(requiredSet) == 0 {
return nodes
}
filtered := make([]string, 0, len(nodes))
for _, node := range nodes {
node = strings.TrimSpace(node)
if node == "" {
continue
}
if _, ok := requiredSet[node]; ok {
filtered = append(filtered, node)
}
}
return filtered
}
// startupNodeStrictlyRequired runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool.
// Why: absent or broken non-core nodes should not block recovery-only actions
// like label reconciliation once the operator has narrowed startup to core nodes.
func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool {
node = strings.TrimSpace(node)
if node == "" {
return false
}
if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 {
return true
}
for _, controlPlane := range o.cfg.ControlPlanes {
if strings.TrimSpace(controlPlane) == node {
return true
}
}
if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) {
return true
}
return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node)
}
// startupRequiredFluxKustomizations runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}.
// Why: lets outage recovery wait on a declared core GitOps slice while leaving
// optional stacks free to converge after bootstrap succeeds.
func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} {
return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations)
}
// startupRequiredWorkloadNamespaces runs one orchestration or CLI step.
// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: keeps workload readiness scoped to core namespaces during recovery while
// preserving broad convergence checks when no explicit core list is configured.
func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} {
return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces)
}
// containsNode runs one orchestration or CLI step.
// Signature: containsNode(entries []string, needle string) bool.
// Why: keeps node-scope checks small and explicit anywhere startup narrows its
// recovery gates to a declared core set.
func containsNode(entries []string, needle string) bool {
needle = strings.TrimSpace(needle)
if needle == "" {
return false
}
for _, entry := range entries {
if strings.TrimSpace(entry) == needle {
return true
}
}
return false
}
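
Aside (illustrative sketch, not part of the diff): the scoping helpers above narrow startup gates to a declared core node set. A standalone sketch of the filtering behavior, with an invented inventory; it mirrors startupRequiredNodes as shown rather than adding anything new.

package main

import (
	"fmt"
	"strings"
)

// filterRequired mirrors startupRequiredNodes above: an empty required list
// keeps the stricter all-nodes behavior, while a non-empty list narrows
// startup gating to the declared core set.
func filterRequired(nodes, required []string) []string {
	req := map[string]struct{}{}
	for _, r := range required {
		if r = strings.TrimSpace(r); r != "" {
			req[r] = struct{}{}
		}
	}
	if len(req) == 0 {
		return nodes
	}
	out := []string{}
	for _, n := range nodes {
		if _, ok := req[strings.TrimSpace(n)]; ok {
			out = append(out, n)
		}
	}
	return out
}

func main() {
	inventory := []string{"titan-0a", "titan-09", "titan-22"}
	fmt.Println(filterRequired(inventory, nil))                  // all nodes still gate startup
	fmt.Println(filterRequired(inventory, []string{"titan-0a"})) // outage recovery: only the core node gates
}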

View File

@ -1,52 +0,0 @@
package cluster
import (
"context"
"fmt"
"time"
)
// maybeRunEarlyVaultUnseal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context).
// Why: gives startup a best-effort Vault recovery path when the API is already
// live, without consuming the hard startup failure path before workloads recover.
func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) {
if err := o.waitForAPI(ctx, 1, time.Second); err != nil {
return
}
o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available")
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
if err != nil {
o.log.Printf("warning: early vault unseal deferred: %v", err)
o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err))
return
}
if deferred {
o.log.Printf("vault early unseal deferred: %s", detail)
o.noteStartupAutoHeal(detail)
return
}
o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed")
}
// runStartupVaultUnsealGate runs one orchestration or CLI step.
// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error.
// Why: keeps the top-level startup flow readable while allowing Vault unseal to
// defer cleanly until critical workload recovery when the pod is not runnable yet.
func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error {
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
if err != nil {
o.noteStartupCheck("vault-unseal", false, err.Error())
return err
}
if deferred {
o.log.Printf("vault unseal deferred until workload recovery: %s", detail)
o.noteStartupAutoHeal(detail)
o.noteStartupCheck("vault-unseal", true, detail)
return nil
}
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
return nil
}
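
Aside (illustrative sketch, not part of the diff): both helpers above fold the three outcomes of ensureVaultUnsealedWhenRunnable (error, deferred, unsealed) into startup bookkeeping. A simplified decision sketch; the check recording is replaced by plain prints and the names are illustrative only.

package main

import (
	"errors"
	"fmt"
)

// gateResult captures the three outcomes the startup vault gate distinguishes.
type gateResult struct {
	deferred bool
	detail   string
	err      error
}

// runGate mirrors the control flow above: an error fails the gate, a deferral
// records a note but lets startup continue, and success marks the check passed.
func runGate(r gateResult) error {
	if r.err != nil {
		fmt.Println("check vault-unseal: FAIL:", r.err)
		return r.err
	}
	if r.deferred {
		fmt.Println("check vault-unseal: OK (deferred):", r.detail)
		return nil
	}
	fmt.Println("check vault-unseal: OK: vault is unsealed")
	return nil
}

func main() {
	_ = runGate(gateResult{})                                                 // already unsealed
	_ = runGate(gateResult{deferred: true, detail: "vault pod not runnable"}) // deferred until workload recovery
	_ = runGate(gateResult{err: errors.New("unseal key file missing")})       // hard failure
}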

View File

@ -177,46 +177,6 @@ type jobConditionRef struct {
Status string `json:"status"`
}
type eventList struct {
Items []eventResource `json:"items"`
}
type eventResource struct {
Metadata struct {
Namespace string `json:"namespace"`
CreationTimestamp time.Time `json:"creationTimestamp"`
} `json:"metadata"`
InvolvedObject struct {
Kind string `json:"kind"`
Namespace string `json:"namespace"`
Name string `json:"name"`
} `json:"involvedObject"`
Type string `json:"type"`
Reason string `json:"reason"`
Message string `json:"message"`
Count int `json:"count"`
EventTime time.Time `json:"eventTime"`
LastTimestamp time.Time `json:"lastTimestamp"`
Series eventSeries `json:"series"`
}
type eventSeries struct {
Count int `json:"count"`
LastObservedTime time.Time `json:"lastObservedTime"`
}
type replicaSetList struct {
Items []replicaSetResource `json:"items"`
}
type replicaSetResource struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
OwnerReferences []ownerReference `json:"ownerReferences"`
} `json:"metadata"`
}
type workloadResource struct {
Kind string `json:"kind"`
Metadata struct {
@ -261,7 +221,6 @@ type podResource struct {
type ownerReference struct {
Kind string `json:"kind"`
Name string `json:"name"`
}
type podContainerStatus struct {

View File

@ -26,12 +26,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error {
lastLogged := time.Time{}
lastRecycleAttempt := time.Time{}
lastReplicaHeal := time.Time{}
lastSchedulingStormHeal := time.Time{}
for {
prevFailure := lastFailure
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
o.maybeAutoQuarantineSchedulingStorms(ctx, &lastSchedulingStormHeal)
ready, detail, err := o.workloadConvergenceReady(ctx)
if err != nil {
lastFailure = err.Error()
@ -73,7 +71,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if err := json.Unmarshal([]byte(out), &list); err != nil {
return false, "", fmt.Errorf("decode controllers: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
@ -87,11 +84,6 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if kind == "" || ns == "" || name == "" {
continue
}
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}
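
Aside (illustrative sketch, not part of the diff): the hunk above shows how convergence scoping layers a required-namespace allowlist ahead of the ignore lists. A tiny predicate sketch of that ordering with invented namespaces.

package main

import "fmt"

// shouldGate reproduces the scoping order visible above: a non-empty required
// namespace set keeps only core namespaces, then explicit ignores drop the rest.
func shouldGate(ns string, required, ignored map[string]struct{}) bool {
	if len(required) > 0 {
		if _, ok := required[ns]; !ok {
			return false
		}
	}
	if _, ok := ignored[ns]; ok {
		return false
	}
	return true
}

func main() {
	required := map[string]struct{}{"monitoring": {}, "vault": {}}
	ignored := map[string]struct{}{"media": {}}
	for _, ns := range []string{"monitoring", "media", "apps"} {
		fmt.Printf("%s gates convergence: %t\n", ns, shouldGate(ns, required, ignored))
	}
	// monitoring gates convergence: true; media and apps do not.
}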

View File

@ -116,7 +116,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
return nil, fmt.Errorf("decode pods: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
stuckReasons := map[string]struct{}{
@ -139,11 +138,6 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
if ns == "" || name == "" {
continue
}
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}

View File

@ -1,88 +0,0 @@
package cluster
import (
"context"
"fmt"
"strings"
"time"
)
// TestHookMaybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: exposes the scheduling-storm trigger guard to the split top-level test module.
func (o *Orchestrator) TestHookMaybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
o.maybeAutoQuarantineSchedulingStorms(ctx, lastAttempt)
}
// TestHookQuarantineSchedulingStormWorkloads runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: exposes the scheduling-storm auto-heal body to the split top-level test module.
func (o *Orchestrator) TestHookQuarantineSchedulingStormWorkloads(ctx context.Context) error {
return o.quarantineSchedulingStormWorkloads(ctx)
}
// TestHookSchedulingStormOwnerWorkload runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormOwnerWorkload(namespace string, ownerKind string, ownerName string, rsOwnerKind string, rsOwnerName string) (string, bool).
// Why: exposes owner-resolution behavior without leaking internal workload types.
func TestHookSchedulingStormOwnerWorkload(
namespace string,
ownerKind string,
ownerName string,
rsOwnerKind string,
rsOwnerName string,
) (string, bool) {
var pod podResource
pod.Metadata.Namespace = strings.TrimSpace(namespace)
pod.Metadata.OwnerReferences = []ownerReference{{
Kind: strings.TrimSpace(ownerKind),
Name: strings.TrimSpace(ownerName),
}}
rsOwners := map[string]ownerReference{}
if rsName := strings.TrimSpace(ownerName); rsName != "" && strings.TrimSpace(ownerKind) == "ReplicaSet" {
rsOwners[pod.Metadata.Namespace+"/"+rsName] = ownerReference{
Kind: strings.TrimSpace(rsOwnerKind),
Name: strings.TrimSpace(rsOwnerName),
}
}
workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
if !ok {
return "", false
}
return fmt.Sprintf("%s/%s/%s", workload.Namespace, workload.Kind, workload.Name), true
}
// TestHookEventObservationCount runs one orchestration or CLI step.
// Signature: TestHookEventObservationCount(count int, seriesCount int) int.
// Why: exposes event-count normalization used by scheduling-storm detection.
func TestHookEventObservationCount(count int, seriesCount int) int {
return eventObservationCount(eventResource{
Count: count,
Series: eventSeries{
Count: seriesCount,
},
})
}
// TestHookEventLastObservedAt runs one orchestration or CLI step.
// Signature: TestHookEventLastObservedAt(seriesLastObserved time.Time, lastTimestamp time.Time, eventTime time.Time, creationTimestamp time.Time) time.Time.
// Why: exposes event-time fallback behavior used by scheduling-storm detection.
func TestHookEventLastObservedAt(
seriesLastObserved time.Time,
lastTimestamp time.Time,
eventTime time.Time,
creationTimestamp time.Time,
) time.Time {
return eventLastObservedAt(eventResource{
LastTimestamp: lastTimestamp,
EventTime: eventTime,
Series: eventSeries{
LastObservedTime: seriesLastObserved,
},
Metadata: struct {
Namespace string `json:"namespace"`
CreationTimestamp time.Time `json:"creationTimestamp"`
}{
CreationTimestamp: creationTimestamp,
},
})
}

View File

@ -1,55 +0,0 @@
package cluster
import "context"
// TestHookStartupRequiredNodes runs one orchestration or CLI step.
// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string.
// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing.
func TestHookStartupRequiredNodes(nodes []string, required []string) []string {
return startupRequiredNodes(nodes, required)
}
// TestHookContainsNode runs one orchestration or CLI step.
// Signature: TestHookContainsNode(entries []string, needle string) bool.
// Why: exposes the small startup-scope membership helper to top-level tests.
func TestHookContainsNode(entries []string, needle string) bool {
return containsNode(entries, needle)
}
// TestHookStartupNodeStrictlyRequired runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool.
// Why: exposes strict-node startup scoping so outage-recovery tests can confirm
// non-core nodes stop blocking bootstrap.
func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool {
return o.startupNodeStrictlyRequired(node)
}
// TestHookStartupRequiredFluxKustomizations runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}.
// Why: exposes flux startup scoping so top-level tests can confirm only core
// kustomizations block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} {
return o.startupRequiredFluxKustomizations()
}
// TestHookStartupRequiredWorkloadNamespaces runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: exposes workload namespace startup scoping so top-level tests can
// confirm only core workloads block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} {
return o.startupRequiredWorkloadNamespaces()
}
// TestHookMaybeRunEarlyVaultUnseal runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context).
// Why: exposes the early startup Vault deferral helper to top-level tests.
func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) {
o.maybeRunEarlyVaultUnseal(ctx)
}
// TestHookRunStartupVaultUnsealGate runs one orchestration or CLI step.
// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error.
// Why: exposes the startup Vault gate helper to top-level tests.
func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error {
return o.runStartupVaultUnsealGate(ctx)
}

View File

@ -33,9 +33,6 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
c.Startup.NodeInventoryReachPollSeconds = 5
}
if c.Startup.NodeInventoryReachRequiredNodes == nil {
c.Startup.NodeInventoryReachRequiredNodes = []string{}
}
if c.Startup.RequiredNodeLabels == nil {
c.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
@ -124,11 +121,7 @@ func (c *Config) applyDefaults() {
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
}
if c.Startup.ServiceChecklistExplicitOnly {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{})
} else {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
}
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
for i := range c.Startup.ServiceChecklist {
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
@ -159,18 +152,12 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
c.Startup.NodeSSHAuthPollSeconds = 5
}
if c.Startup.NodeSSHAuthRequiredNodes == nil {
c.Startup.NodeSSHAuthRequiredNodes = []string{}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
c.Startup.FluxHealthWaitSeconds = 900
}
if c.Startup.FluxHealthPollSeconds <= 0 {
c.Startup.FluxHealthPollSeconds = 5
}
if c.Startup.FluxHealthRequiredKustomizations == nil {
c.Startup.FluxHealthRequiredKustomizations = []string{}
}
if c.Startup.IgnoreFluxKustomizations == nil {
c.Startup.IgnoreFluxKustomizations = []string{}
}
@ -180,9 +167,6 @@ func (c *Config) applyDefaults() {
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
c.Startup.WorkloadConvergencePollSeconds = 5
}
if c.Startup.WorkloadConvergenceRequiredNamespaces == nil {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{}
}
if c.Startup.IgnoreWorkloadNamespaces == nil {
c.Startup.IgnoreWorkloadNamespaces = []string{}
}
@ -195,12 +179,6 @@ func (c *Config) applyDefaults() {
if c.Startup.StuckPodGraceSeconds <= 0 {
c.Startup.StuckPodGraceSeconds = 180
}
if c.Startup.PostStartAutoHealSeconds <= 0 {
c.Startup.PostStartAutoHealSeconds = 60
}
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
c.Startup.DeadNodeCleanupGraceSeconds = 300
}
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
}
@ -243,12 +221,6 @@ func (c *Config) applyDefaults() {
if c.UPS.TelemetryTimeoutSeconds <= 0 {
c.UPS.TelemetryTimeoutSeconds = 90
}
if c.Startup.SchedulingStormEventThreshold <= 0 {
c.Startup.SchedulingStormEventThreshold = 30
}
if c.Startup.SchedulingStormWindowSeconds <= 0 {
c.Startup.SchedulingStormWindowSeconds = 180
}
if c.Coordination.ForwardShutdownConfig == "" {
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
}

View File

@ -39,25 +39,24 @@ func defaults() Config {
"maintenance",
},
Startup: Startup{
APIWaitSeconds: 1200,
APIPollSeconds: 2,
ShutdownCooldownSeconds: 45,
RequireNodeInventoryReach: true,
NodeInventoryReachWaitSeconds: 300,
NodeInventoryReachPollSeconds: 5,
NodeInventoryReachRequiredNodes: []string{},
RequireTimeSync: true,
TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5,
TimeSyncMode: "quorum",
TimeSyncQuorum: 2,
ReconcileAccessOnBoot: true,
AutoEtcdRestoreOnAPIFailure: true,
EtcdRestoreControlPlane: "titan-0a",
RequireStorageReady: true,
StorageReadyWaitSeconds: 420,
StorageReadyPollSeconds: 5,
StorageMinReadyNodes: 2,
APIWaitSeconds: 1200,
APIPollSeconds: 2,
ShutdownCooldownSeconds: 45,
RequireNodeInventoryReach: true,
NodeInventoryReachWaitSeconds: 300,
NodeInventoryReachPollSeconds: 5,
RequireTimeSync: true,
TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5,
TimeSyncMode: "quorum",
TimeSyncQuorum: 2,
ReconcileAccessOnBoot: true,
AutoEtcdRestoreOnAPIFailure: true,
EtcdRestoreControlPlane: "titan-0a",
RequireStorageReady: true,
StorageReadyWaitSeconds: 420,
StorageReadyPollSeconds: 5,
StorageMinReadyNodes: 2,
StorageCriticalPVCs: []string{
"vault/data-vault-0",
"postgres/postgres-data-postgres-0",
@ -92,36 +91,33 @@ func defaults() Config {
AdminSecretUsernameKey: "username",
AdminSecretPasswordKey: "password",
},
ServiceChecklist: defaultServiceChecklist(),
RequireCriticalServiceEndpoints: true,
CriticalServiceEndpointWaitSec: 420,
CriticalServiceEndpointPollSec: 5,
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
RequireIngressChecklist: true,
IngressChecklistWaitSeconds: 420,
IngressChecklistPollSeconds: 5,
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
IngressChecklistIgnoreHosts: []string{},
RequireNodeSSHAuth: true,
NodeSSHAuthWaitSeconds: 240,
NodeSSHAuthPollSeconds: 5,
NodeSSHAuthRequiredNodes: []string{},
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
FluxHealthRequiredKustomizations: []string{},
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
WorkloadConvergenceRequiredNamespaces: []string{},
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
ServiceChecklist: defaultServiceChecklist(),
RequireCriticalServiceEndpoints: true,
CriticalServiceEndpointWaitSec: 420,
CriticalServiceEndpointPollSec: 5,
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
RequireIngressChecklist: true,
IngressChecklistWaitSeconds: 420,
IngressChecklistPollSeconds: 5,
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
IngressChecklistIgnoreHosts: []string{},
RequireNodeSSHAuth: true,
NodeSSHAuthWaitSeconds: 240,
NodeSSHAuthPollSeconds: 5,
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
},
Shutdown: Shutdown{
DefaultBudgetSeconds: 1380,

View File

@ -51,41 +51,3 @@ startup:
t.Fatalf("expected validation failure")
}
}
// TestLoadKeepsExplicitServiceChecklist runs one orchestration or CLI step.
// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T).
// Why: host recovery configs must be able to keep a narrow, explicit checklist
// without silently inheriting the full default service catalog.
func TestLoadKeepsExplicitServiceChecklist(t *testing.T) {
cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
raw := `
control_planes: [titan-0a]
expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
iac_repo_path: /opt/titan-iac
startup:
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
ups:
enabled: false
`
if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil {
t.Fatalf("write config: %v", err)
}
cfg, err := Load(cfgPath)
if err != nil {
t.Fatalf("load config: %v", err)
}
if len(cfg.Startup.ServiceChecklist) != 1 {
t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist))
}
if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" {
t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name)
}
}

View File

@ -27,75 +27,65 @@ type Config struct {
}
type Startup struct {
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
TimeSyncMode string `yaml:"time_sync_mode"`
TimeSyncQuorum int `yaml:"time_sync_quorum"`
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
RequireStorageReady bool `yaml:"require_storage_ready"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"`
RequireServiceChecklist bool `yaml:"require_service_checklist"`
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"`
RequireFluxHealth bool `yaml:"require_flux_health"`
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"`
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"`
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
IgnoreWorkloads []string `yaml:"ignore_workloads"`
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
AutoQuarantineSchedulingStorms bool `yaml:"auto_quarantine_scheduling_storms"`
SchedulingStormEventThreshold int `yaml:"scheduling_storm_event_threshold"`
SchedulingStormWindowSeconds int `yaml:"scheduling_storm_window_seconds"`
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
PostStartAutoHealSeconds int `yaml:"post_start_auto_heal_seconds"`
DeadNodeCleanupGraceSeconds int `yaml:"dead_node_cleanup_grace_seconds"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
TimeSyncMode string `yaml:"time_sync_mode"`
TimeSyncQuorum int `yaml:"time_sync_quorum"`
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
RequireStorageReady bool `yaml:"require_storage_ready"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"`
RequireServiceChecklist bool `yaml:"require_service_checklist"`
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
RequireFluxHealth bool `yaml:"require_flux_health"`
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
IgnoreWorkloads []string `yaml:"ignore_workloads"`
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
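
Aside (illustrative sketch, not part of the diff): the yaml tags above are the contract between ananke.yaml and the Startup struct. A minimal sketch of that mapping with just a few of the fields, assuming gopkg.in/yaml.v3 as the decoder — the diff does not show which YAML library the repository actually uses.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// startup carries a small subset of the fields above, with the same yaml keys.
type startup struct {
	APIWaitSeconds         int      `yaml:"api_wait_seconds"`
	RequireFluxHealth      bool     `yaml:"require_flux_health"`
	IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
}

func main() {
	raw := "api_wait_seconds: 1200\nrequire_flux_health: true\nignore_unavailable_nodes: [titan-22]\n"
	var s startup
	if err := yaml.Unmarshal([]byte(raw), &s); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", s)
	// {APIWaitSeconds:1200 RequireFluxHealth:true IgnoreUnavailableNodes:[titan-22]}
}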
type ServiceChecklistCheck struct {
@ -146,7 +136,6 @@ type UPS struct {
Targets []UPSTarget `yaml:"targets"`
PollSeconds int `yaml:"poll_seconds"`
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
OnBatteryGraceSeconds int `yaml:"on_battery_grace_seconds"`
DebounceCount int `yaml:"debounce_count"`
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
}

View File

@ -61,11 +61,6 @@ func (c Config) Validate() error {
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
}
for _, node := range c.Startup.NodeInventoryReachRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
}
}
for node, labels := range c.Startup.RequiredNodeLabels {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
@ -238,46 +233,21 @@ func (c Config) Validate() error {
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
}
for _, node := range c.Startup.NodeSSHAuthRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty")
}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
}
if c.Startup.FluxHealthPollSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
}
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
item = strings.TrimSpace(item)
if item == "" {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty")
}
if strings.Count(item, "/") != 1 {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item)
}
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
}
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
}
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if strings.TrimSpace(ns) == "" {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty")
}
}
if c.Startup.StuckPodGraceSeconds <= 0 {
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
}
if c.Startup.PostStartAutoHealSeconds <= 0 {
return fmt.Errorf("config.startup.post_start_auto_heal_seconds must be > 0")
}
if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
return fmt.Errorf("config.startup.dead_node_cleanup_grace_seconds must be > 0")
}
for _, probe := range c.Startup.PostStartProbes {
if strings.TrimSpace(probe) == "" {
return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
@ -307,16 +277,6 @@ func (c Config) Validate() error {
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
}
}
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) {
return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item))
}
}
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns))
}
}
for _, node := range c.Startup.IgnoreUnavailableNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
@ -332,9 +292,6 @@ func (c Config) Validate() error {
if c.UPS.Provider == "" {
return fmt.Errorf("config.ups.provider must not be empty when ups is enabled")
}
if c.UPS.OnBatteryGraceSeconds < 0 {
return fmt.Errorf("config.ups.on_battery_grace_seconds must be >= 0")
}
if c.UPS.Target == "" && len(c.UPS.Targets) == 0 {
return fmt.Errorf("config.ups.target or config.ups.targets must be set when ups is enabled")
}
@ -349,14 +306,6 @@ func (c Config) Validate() error {
return fmt.Errorf("config.coordination.forward_shutdown_config must not be empty when forward_shutdown_host is set")
}
}
if c.Startup.AutoQuarantineSchedulingStorms {
if c.Startup.SchedulingStormEventThreshold <= 0 {
return fmt.Errorf("config.startup.scheduling_storm_event_threshold must be > 0 when auto_quarantine_scheduling_storms is enabled")
}
if c.Startup.SchedulingStormWindowSeconds <= 0 {
return fmt.Errorf("config.startup.scheduling_storm_window_seconds must be > 0 when auto_quarantine_scheduling_storms is enabled")
}
}
for _, peer := range c.Coordination.PeerHosts {
if strings.TrimSpace(peer) == "" {
return fmt.Errorf("config.coordination.peer_hosts entries must not be empty")
@ -379,20 +328,3 @@ func (c Config) Validate() error {
}
return nil
}
// containsTrimmed runs one orchestration or CLI step.
// Signature: containsTrimmed(entries []string, needle string) bool.
// Why: startup config now supports both required and ignored recovery scopes, so
// validation needs a single normalized overlap check for those lists.
func containsTrimmed(entries []string, needle string) bool {
needle = strings.TrimSpace(needle)
if needle == "" {
return false
}
for _, entry := range entries {
if strings.TrimSpace(entry) == needle {
return true
}
}
return false
}
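
Aside (illustrative sketch, not part of the diff): containsTrimmed backs the overlap checks earlier in Validate, which reject a kustomization or namespace that is both required and ignored. A tiny standalone check of that rule; the function body is copied from the snippet above, the sample lists are invented.

package main

import (
	"fmt"
	"strings"
)

// containsTrimmed is copied from the snippet above so the example runs standalone.
func containsTrimmed(entries []string, needle string) bool {
	needle = strings.TrimSpace(needle)
	if needle == "" {
		return false
	}
	for _, entry := range entries {
		if strings.TrimSpace(entry) == needle {
			return true
		}
	}
	return false
}

func main() {
	required := []string{"flux-system/core"}
	ignored := []string{" flux-system/core "}
	for _, item := range required {
		if containsTrimmed(ignored, item) {
			fmt.Printf("invalid config: %q is both required and ignored\n", strings.TrimSpace(item))
		}
	}
}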

View File

@ -30,7 +30,6 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
@ -69,42 +68,19 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
{"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }},
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
{"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }},
{"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }},
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
{"bad_post_start_auto_heal_seconds", func(c *Config) { c.Startup.PostStartAutoHealSeconds = 0 }},
{"bad_dead_node_cleanup_grace_seconds", func(c *Config) { c.Startup.DeadNodeCleanupGraceSeconds = 0 }},
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
{"bad_overlap_flux_required_and_ignored", func(c *Config) {
c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"}
c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"}
}},
{"bad_overlap_workload_required_and_ignored", func(c *Config) {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"}
}},
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
{"bad_scheduling_storm_threshold", func(c *Config) {
c.Startup.AutoQuarantineSchedulingStorms = true
c.Startup.SchedulingStormEventThreshold = 0
}},
{"bad_scheduling_storm_window", func(c *Config) {
c.Startup.AutoQuarantineSchedulingStorms = true
c.Startup.SchedulingStormWindowSeconds = 0
}},
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
{"bad_ups_provider", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "" }},
{"bad_ups_on_battery_grace_negative", func(c *Config) { c.UPS.Enabled = true; c.UPS.OnBatteryGraceSeconds = -1 }},
{"bad_ups_target_empty", func(c *Config) { c.UPS.Enabled = true; c.UPS.Provider = "nut"; c.UPS.Target = ""; c.UPS.Targets = nil }},
{"bad_ups_targets_item_empty", func(c *Config) {
c.UPS.Enabled = true
@ -145,13 +121,6 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup defaults to be set")
}
if cfg.Startup.PostStartAutoHealSeconds <= 0 || cfg.Startup.DeadNodeCleanupGraceSeconds <= 0 {
t.Fatalf("expected post-start auto-heal defaults to be set")
}
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
t.Fatalf("expected startup recovery scope slices to be initialized")
}
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
t.Fatalf("expected critical service endpoint timing defaults to be set")
}

View File

@ -32,8 +32,6 @@ type Daemon struct {
targets []Target
log *log.Logger
exporter *metrics.Exporter
postStartAutoHealOverride func(context.Context) error
}
var sshConfigCandidates = []string{
@ -94,9 +92,7 @@ func (d *Daemon) Run(ctx context.Context) error {
lastGood := map[string]time.Time{}
lastOnBattery := map[string]bool{}
onBatterySince := map[string]time.Time{}
breachCount := map[string]int{}
lastAutoHeal := time.Time{}
for _, t := range d.targets {
lastGood[t.Name] = time.Now()
}
@ -111,16 +107,12 @@ func (d *Daemon) Run(ctx context.Context) error {
case <-t.C:
budget := d.orch.EstimatedEmergencyShutdownSeconds()
threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor))
anyOnBattery := false
d.exporter.UpdateBudget(budget)
for _, target := range d.targets {
sample, err := target.Provider.Read(ctx)
if err != nil {
if lastOnBattery[target.Name] {
anyOnBattery = true
}
d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err)
d.exporter.UpdateSample(metrics.Sample{
Name: target.Name,
@ -139,45 +131,17 @@ func (d *Daemon) Run(ctx context.Context) error {
}
lastGood[target.Name] = time.Now()
if sample.OnBattery {
anyOnBattery = true
}
wasOnBattery := lastOnBattery[target.Name]
if sample.OnBattery {
if !wasOnBattery || onBatterySince[target.Name].IsZero() {
onBatterySince[target.Name] = time.Now()
}
} else {
onBatterySince[target.Name] = time.Time{}
}
lastOnBattery[target.Name] = sample.OnBattery
onBatteryElapsed := 0
if sample.OnBattery && !onBatterySince[target.Name].IsZero() {
onBatteryElapsed = int(time.Since(onBatterySince[target.Name]).Seconds())
}
trigger := false
triggerReason := ""
switch {
case sample.LowBattery:
trigger = true
triggerReason = fmt.Sprintf("ups-low-battery target=%s status=%s", target.Name, sample.RawStatus)
case sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold:
trigger = true
triggerReason = fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
case sample.OnBattery && d.cfg.UPS.OnBatteryGraceSeconds > 0 && onBatteryElapsed >= d.cfg.UPS.OnBatteryGraceSeconds:
trigger = true
triggerReason = fmt.Sprintf("ups-on-battery target=%s elapsed=%ds grace=%ds status=%s", target.Name, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RawStatus)
}
trigger := sample.LowBattery || (sample.OnBattery && sample.RuntimeSeconds > 0 && sample.RuntimeSeconds <= threshold)
if trigger {
breachCount[target.Name]++
} else {
breachCount[target.Name] = 0
}
d.log.Printf("ups target=%s status=%s on_battery=%t on_battery_s=%d grace_s=%d runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
target.Name, sample.RawStatus, sample.OnBattery, onBatteryElapsed, d.cfg.UPS.OnBatteryGraceSeconds, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
d.log.Printf("ups target=%s status=%s on_battery=%t runtime_s=%d threshold_s=%d budget_s=%d trigger=%t breach=%d",
target.Name, sample.RawStatus, sample.OnBattery, sample.RuntimeSeconds, threshold, budget, trigger, breachCount[target.Name])
d.exporter.UpdateSample(metrics.Sample{
Name: target.Name,
@ -196,54 +160,14 @@ func (d *Daemon) Run(ctx context.Context) error {
})
if breachCount[target.Name] >= debounce {
return d.triggerShutdown(ctx, triggerReason)
reason := fmt.Sprintf("ups-threshold target=%s runtime=%ds threshold=%ds status=%s", target.Name, sample.RuntimeSeconds, threshold, sample.RawStatus)
return d.triggerShutdown(ctx, reason)
}
}
d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery)
}
}
}
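
Aside (illustrative sketch, not part of the diff): the daemon loop above decides to shut down from up to three signals — a low-battery flag, a runtime estimate at or below ceil(budget × safety factor), and, on one side of this diff, a sustained on-battery window — debounced across consecutive polls. A condensed sketch of that decision with simplified types; field and function names are illustrative.

package main

import (
	"fmt"
	"math"
)

type sample struct {
	OnBattery        bool
	LowBattery       bool
	RuntimeSeconds   int
	OnBatteryElapsed int // seconds since the target first reported on-battery
}

// shouldTrigger mirrors the trigger conditions visible above. graceSeconds <= 0
// disables the sustained-on-battery path.
func shouldTrigger(s sample, budget int, safety float64, graceSeconds int) (bool, string) {
	threshold := int(math.Ceil(float64(budget) * safety))
	switch {
	case s.LowBattery:
		return true, "ups-low-battery"
	case s.OnBattery && s.RuntimeSeconds > 0 && s.RuntimeSeconds <= threshold:
		return true, fmt.Sprintf("ups-threshold runtime=%ds threshold=%ds", s.RuntimeSeconds, threshold)
	case s.OnBattery && graceSeconds > 0 && s.OnBatteryElapsed >= graceSeconds:
		return true, fmt.Sprintf("ups-on-battery elapsed=%ds grace=%ds", s.OnBatteryElapsed, graceSeconds)
	default:
		return false, ""
	}
}

func main() {
	breach, debounce := 0, 2
	samples := []sample{
		{OnBattery: true, RuntimeSeconds: 900}, // plenty of runtime: no trigger
		{OnBattery: true, RuntimeSeconds: 120}, // at threshold: breach 1
		{OnBattery: true, RuntimeSeconds: 110}, // below threshold: breach 2 -> shut down
	}
	for _, s := range samples {
		ok, reason := shouldTrigger(s, 100, 1.2, 300)
		if ok {
			breach++
		} else {
			breach = 0
		}
		fmt.Printf("trigger=%t breach=%d reason=%s\n", ok, breach, reason)
		if breach >= debounce {
			fmt.Println("initiating emergency shutdown:", reason)
			break
		}
	}
}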
// maybeRunPostStartAutoHeal runs one orchestration or CLI step.
// Signature: (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool).
// Why: gives the long-running daemon a bounded path to repair post-start drift
// like a later Vault reseal or stale dead-node deletions without waiting for a
// fresh bootstrap run.
func (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool) {
interval := time.Duration(d.cfg.Startup.PostStartAutoHealSeconds) * time.Second
if interval <= 0 || anyOnBattery {
return
}
if d.orch == nil && d.postStartAutoHealOverride == nil {
return
}
now := time.Now()
if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval {
return
}
if lastRun != nil {
*lastRun = now
}
if err := d.runPostStartAutoHeal(ctx); err != nil {
d.log.Printf("warning: post-start auto-heal: %v", err)
}
}
// runPostStartAutoHeal runs one orchestration or CLI step.
// Signature: (d *Daemon) runPostStartAutoHeal(ctx context.Context) error.
// Why: keeps the daemon loop readable while allowing unit tests to inject a
// deterministic repair hook without a live cluster.
func (d *Daemon) runPostStartAutoHeal(ctx context.Context) error {
if d.postStartAutoHealOverride != nil {
return d.postStartAutoHealOverride(ctx)
}
if d.orch == nil {
return nil
}
return d.orch.RunPostStartAutoHeal(ctx)
}
// triggerShutdown runs one orchestration or CLI step.
// Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@ -165,50 +165,6 @@ func TestDaemonRunTriggersShutdownOnTelemetryTimeout(t *testing.T) {
}
}
// TestDaemonRunTriggersShutdownAfterOnBatteryGrace runs one orchestration or CLI step.
// Signature: TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T).
// Why: covers the sustained-on-battery trigger so short runtime estimates are not
// the only path to a graceful shutdown during abrupt power loss.
func TestDaemonRunTriggersShutdownAfterOnBatteryGrace(t *testing.T) {
stateDir := t.TempDir()
orch := newDaemonTestOrchestrator(t, stateDir)
d := &Daemon{
cfg: config.Config{
UPS: config.UPS{
Enabled: true,
PollSeconds: 1,
DebounceCount: 1,
RuntimeSafetyFactor: 1.0,
OnBatteryGraceSeconds: 1,
},
State: config.State{
IntentPath: filepath.Join(stateDir, "intent.json"),
},
Shutdown: config.Shutdown{
EmergencySkipDrain: true,
EmergencySkipEtcd: true,
},
},
orch: orch,
targets: []Target{
{
Name: "Pyrphoros",
Target: "pyrphoros@localhost",
Provider: &daemonFakeProvider{
samples: []ups.Sample{{OnBattery: true, LowBattery: false, RuntimeSeconds: 9999, RawStatus: "OB"}},
},
},
},
log: log.New(io.Discard, "", 0),
exporter: metrics.New(),
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := d.Run(ctx); err != nil {
t.Fatalf("expected sustained-on-battery shutdown path to complete, got %v", err)
}
}
// TestForwardShutdownSucceedsWithSSHShim runs one orchestration or CLI step.
// Signature: TestForwardShutdownSucceedsWithSSHShim(t *testing.T).
// Why: covers forward-shutdown SSH execution path.

View File

@ -1,51 +0,0 @@
package service
import (
"context"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
)
// TestDaemonMaybeRunPostStartAutoHeal runs one orchestration or CLI step.
// Signature: TestDaemonMaybeRunPostStartAutoHeal(t *testing.T).
// Why: covers the daemon-side interval and on-battery guards for the new
// post-start repair loop.
func TestDaemonMaybeRunPostStartAutoHeal(t *testing.T) {
calls := 0
d := &Daemon{
cfg: config.Config{
Startup: config.Startup{
PostStartAutoHealSeconds: 10,
},
},
postStartAutoHealOverride: func(context.Context) error {
calls++
return nil
},
}
var last time.Time
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 1 {
t.Fatalf("expected first auto-heal invocation, got %d", calls)
}
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 1 {
t.Fatalf("expected interval guard to suppress second call, got %d", calls)
}
last = time.Now().Add(-11 * time.Second)
d.maybeRunPostStartAutoHeal(context.Background(), &last, true)
if calls != 1 {
t.Fatalf("expected on-battery guard to suppress call, got %d", calls)
}
last = time.Now().Add(-11 * time.Second)
d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
if calls != 2 {
t.Fatalf("expected second allowed auto-heal call, got %d", calls)
}
}

View File

@ -22,23 +22,12 @@ type Intent struct {
UpdatedAt time.Time `json:"updated_at"`
}
var (
readIntentImpl = readIntentDefault
writeIntentImpl = writeIntentDefault
)
var writeIntentImpl = writeIntentDefault
// ReadIntent runs one orchestration or CLI step.
// Signature: ReadIntent(path string) (Intent, error).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func ReadIntent(path string) (Intent, error) {
return readIntentImpl(path)
}
// readIntentDefault runs one orchestration or CLI step.
// Signature: readIntentDefault(path string) (Intent, error).
// Why: keeps production read behavior available while tests can override intent
// reads deterministically without racing background file mutations.
func readIntentDefault(path string) (Intent, error) {
b, err := os.ReadFile(path)
if err != nil {
if os.IsNotExist(err) {

View File

@ -22,34 +22,6 @@ func TestHookWriteIntentDefault(path string, in Intent) error {
return writeIntentDefault(path, in)
}
// TestHookReadIntentDefault runs one orchestration or CLI step.
// Signature: TestHookReadIntentDefault(path string) (Intent, error).
// Why: lets top-level tests delegate to production ReadIntent behavior while
// selectively forcing deterministic read sequences for lifecycle branches.
func TestHookReadIntentDefault(path string) (Intent, error) {
return readIntentDefault(path)
}
// TestHookSetReadIntentOverride runs one orchestration or CLI step.
// Signature: TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()).
// Why: enables deterministic intent-read failure injection without sleeping
// goroutines that race slower CI agents.
func TestHookSetReadIntentOverride(fn func(path string) (Intent, error)) (restore func()) {
testHookOverrideMu.Lock()
prev := readIntentImpl
if fn == nil {
readIntentImpl = readIntentDefault
} else {
readIntentImpl = fn
}
testHookOverrideMu.Unlock()
return func() {
testHookOverrideMu.Lock()
readIntentImpl = prev
testHookOverrideMu.Unlock()
}
}
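A hypothetical caller of the hook above, sketched to show the intended pattern: swap in a failing reader, exercise the branch under test, and restore the default on exit. The test name and injected error are illustrative, not part of this change; the sketch assumes the standard errors and testing imports and lives in the same package as Intent.

func TestLifecycleSurvivesIntentReadFailure(t *testing.T) {
	restore := TestHookSetReadIntentOverride(func(path string) (Intent, error) {
		// Deterministically fail every read instead of racing background file mutations.
		return Intent{}, errors.New("injected intent read failure")
	})
	defer restore()
	// ... exercise the code path that calls ReadIntent and assert it degrades gracefully ...
}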
// TestHookSetWriteIntentOverride runs one orchestration or CLI step.
// Signature: TestHookSetWriteIntentOverride(fn func(path string, in Intent) error) (restore func()).
// Why: enables deterministic intent-write failure injection from the top-level

View File

@ -1,116 +0,0 @@
# Binary, config template, and systemd artifact helpers for the installer.
resolve_build_target() {
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
echo "./cmd/ananke"
return 0
fi
return 1
}
install_config_template() {
local template="$1"
local dest="$2"
local src legacy
local -a modern_candidates=()
local -a legacy_candidates=()
case "${template}" in
coordinator)
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
legacy_candidates=("configs/hecate.titan-db.yaml")
;;
peer)
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
legacy_candidates=("configs/hecate.tethys.yaml")
;;
example)
modern_candidates=("configs/ananke.example.yaml")
legacy_candidates=("configs/hecate.example.yaml")
;;
*)
echo "[install] unknown config template key: ${template}" >&2
return 1
;;
esac
for src in "${modern_candidates[@]}"; do
if [[ -f "${src}" ]]; then
install -m 0640 "${src}" "${dest}"
return 0
fi
done
for legacy in "${legacy_candidates[@]}"; do
if [[ -f "${legacy}" ]]; then
src="$(mktemp)"
legacy_path_rewrite "${legacy}" "${src}"
install -m 0640 "${src}" "${dest}"
rm -f "${src}"
return 0
fi
done
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
return 1
}
install_systemd_units() {
local tmp
while IFS='|' read -r target_name modern_name legacy_name; do
local modern_src="deploy/systemd/${modern_name}"
local legacy_src="deploy/systemd/${legacy_name}"
local target="${SYSTEMD_DIR}/${target_name}"
if [[ -f "${modern_src}" ]]; then
install -m 0644 "${modern_src}" "${target}"
continue
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
install -m 0644 "${tmp}" "${target}"
rm -f "${tmp}"
continue
fi
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
return 1
done <<'EOF_UNITS'
ananke.service|ananke.service|hecate.service
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
ananke-update.service|ananke-update.service|hecate-update.service
ananke-update.timer|ananke-update.timer|hecate-update.timer
EOF_UNITS
}
install_self_update_script() {
local modern_src="scripts/ananke-self-update.sh"
local legacy_src="scripts/hecate-self-update.sh"
local target="${LIB_DIR}/ananke-self-update.sh"
local tmp
if [[ -f "${modern_src}" ]]; then
install -m 0755 "${modern_src}" "${target}"
return 0
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
sed -Ei \
-e 's/HECATE_/ANANKE_/g' \
-e 's/hecate-self-update/ananke-self-update/g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
"${tmp}"
install -m 0755 "${tmp}" "${target}"
rm -f "${tmp}"
return 0
fi
echo "[install] missing both modern and legacy self-update scripts." >&2
return 1
}

View File

@ -1,334 +0,0 @@
# Config migration helpers for the Ananke host installer.
read_ananke_role() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
echo "coordinator"
return 0
fi
local role
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -z "${role}" ]]; then
role="coordinator"
fi
echo "${role}"
}
migration_yaml_lookup() {
local key="$1"
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
first_control_plane_name() {
awk '
/^control_planes:[[:space:]]*$/ {in_list=1; next}
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
in_list && /^[^[:space:]]/ {in_list=0}
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
lookup_node_host() {
local node="$1"
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
migrate_ananke_config() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
return 0
fi
local changed=0
local role_hint
role_hint="$(read_ananke_role)"
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
changed=1
fi
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
install_cluster_inventory_defaults "${role_hint}" && changed=1
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
fi
}
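The service_checklist entries added by the migration above carry four fields per probe: url, accepted_statuses, body_contains, and timeout_seconds. As a hedged illustration of how one such entry could be evaluated (this is not the daemon's implementation, which is not shown in this diff; the function name and exact semantics are assumptions; assumed imports: "context", "io", "net/http", "strings", "time"):

func probeChecklistEntry(ctx context.Context, url string, acceptedStatuses []int, bodyContains string, timeout time.Duration) (bool, error) {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return false, err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return false, err
	}
	// Accept only the configured status codes, then require the body marker.
	statusOK := false
	for _, code := range acceptedStatuses {
		if resp.StatusCode == code {
			statusOK = true
			break
		}
	}
	return statusOK && strings.Contains(string(body), bodyContains), nil
}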
install_cluster_inventory_defaults() {
local role="$1"
local changed=0
local inventory_block=""
local managed_block=""
local workers_block
workers_block='workers:
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
if [[ "${role}" == "coordinator" || "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
if [[ -n "${inventory_block}" ]] && grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
changed=1
fi
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
changed=1
fi
if [[ -n "${managed_block}" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
install_peer_inventory_defaults && changed=1
fi
[[ "${changed}" -eq 1 ]]
}
install_peer_inventory_defaults() {
local changed=0
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
[[ "${changed}" -eq 1 ]]
}
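The hydration above fills three YAML shapes: ssh_node_hosts (node name to IP), ssh_managed_nodes (a flat node list), and workers (the ordering used for startup/shutdown orchestration). A hedged sketch of the Go types those shapes would naturally unmarshal into; the struct name, fields, and yaml tags are assumptions for illustration, not copied from internal/config:

// Illustrative only; not the daemon's actual config structs.
type inventorySketch struct {
	SSHNodeHosts    map[string]string `yaml:"ssh_node_hosts"`    // e.g. "titan-0a" -> "192.168.22.11"
	SSHManagedNodes []string          `yaml:"ssh_managed_nodes"` // nodes eligible for SSH orchestration
	Workers         []string          `yaml:"workers"`           // worker nodes for startup/shutdown ordering
}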
sanitize_migrated_ananke_config() {
local cfg="${CONF_DIR}/ananke.yaml"
[[ -f "${cfg}" ]] || return 0
local tmp changed=0
tmp="$(mktemp)"
# If a legacy migration bug appended root-level node entries after
# ssh_managed_nodes, drop those orphan entries until the next top-level key.
awk '
BEGIN {in_managed=0}
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
{
if (in_managed) {
if ($0 ~ /^ - /) {print; next}
if ($0 ~ /^- /) {next}
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
}
print
}
' "${cfg}" > "${tmp}"
if ! cmp -s "${cfg}" "${tmp}"; then
mv "${tmp}" "${cfg}"
changed=1
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
else
rm -f "${tmp}"
fi
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
changed=1
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${cfg}" || true
fi
}

View File

@ -1,239 +0,0 @@
# Host bootstrap helpers for the Ananke installer.
resolve_nut_ups_name() {
if [[ -n "${NUT_UPS_NAME}" ]]; then
return 0
fi
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
local target=""
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
if [[ -n "${target}" ]]; then
NUT_UPS_NAME="${target%@localhost}"
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
return 0
fi
fi
NUT_UPS_NAME="pyrphoros"
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
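resolve_nut_ups_name leans on the daemon's target: <ups>@localhost convention, the same shape the tests use (pyrphoros@localhost). A one-line sketch of that split on the Go side, assuming strings.Cut; the helper name is illustrative:

// Split "pyrphoros@localhost" into ("pyrphoros", "localhost", true).
func splitUPSTarget(target string) (upsName, host string, ok bool) {
	return strings.Cut(target, "@")
}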
ensure_ananke_kubeconfig() {
local kubeconfig_path
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
if [[ -z "${kubeconfig_path}" ]]; then
kubeconfig_path="/etc/ananke/kubeconfig"
fi
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
fi
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
cp_name="$(first_control_plane_name)"
if [[ -z "${cp_name}" ]]; then
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
return 0
fi
cp_host="$(lookup_node_host "${cp_name}")"
if [[ -z "${cp_host}" ]]; then
cp_host="${cp_name}"
fi
ssh_user="$(migration_yaml_lookup "ssh_user")"
ssh_port="$(migration_yaml_lookup "ssh_port")"
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${ssh_port}" ]]; then
ssh_port="2277"
fi
local target
target="${cp_host}"
if [[ -n "${ssh_user}" ]]; then
target="${ssh_user}@${cp_host}"
fi
local ssh_args=(
-o BatchMode=yes
-o ConnectTimeout=8
-o StrictHostKeyChecking=accept-new
)
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
ssh_args+=(-F "${ssh_cfg}")
fi
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
ssh_args+=(-i "${ssh_key}")
fi
if [[ -n "${ssh_port}" ]]; then
ssh_args+=(-p "${ssh_port}")
fi
local remote_cfg
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
chmod 0600 "${kubeconfig_path}"
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
else
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
fi
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
ensure_ananke_ssh_identity() {
local key_path key_dir key_user key_comment
key_path="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${key_path}" ]]; then
key_path="/home/atlas/.ssh/id_ed25519"
fi
key_dir="$(dirname "${key_path}")"
key_comment="ananke-$(hostname)-forward"
key_user="root"
if [[ "${key_path}" == /home/*/* ]]; then
key_user="${key_path#/home/}"
key_user="${key_user%%/*}"
fi
if ! id "${key_user}" >/dev/null 2>&1; then
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
return 0
fi
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
if [[ ! -s "${key_path}" ]]; then
echo "[install] generating missing SSH identity at ${key_path}"
if [[ "${key_user}" == "root" ]]; then
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
else
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
fi
fi
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
chmod 0600 "${key_path}" || true
chmod 0644 "${key_path}.pub" || true
}
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
missing+=("${pkg}")
fi
done
if [[ ${#missing[@]} -eq 0 ]]; then
return 0
fi
echo "[install] apt install: ${missing[*]}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y "${missing[@]}"
}
install_kubectl_if_missing() {
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
ensure_apt_packages kubernetes-client || true
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
echo "[install] installing kubectl via upstream binary"
local arch
arch="$(uname -m)"
case "${arch}" in
x86_64) arch="amd64" ;;
aarch64|arm64) arch="arm64" ;;
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
esac
local version
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
chmod 0755 /usr/local/bin/kubectl
}
ensure_dependencies() {
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
echo "[install] skipping dependency installation"
return 0
fi
if ! command -v apt-get >/dev/null 2>&1; then
echo "This installer currently supports apt-based hosts only." >&2
exit 1
fi
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
install_kubectl_if_missing
}
configure_nut() {
if [[ "${MANAGE_NUT}" != "1" ]]; then
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
return 0
fi
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
install -d -m 0755 /etc/nut /etc/udev/rules.d
cat > /etc/nut/nut.conf <<EOF
MODE=standalone
EOF
cat > /etc/nut/ups.conf <<EOF
[${NUT_UPS_NAME}]
driver = usbhid-ups
port = auto
vendorid = ${NUT_VENDOR_ID}
productid = ${NUT_PRODUCT_ID}
pollinterval = 5
EOF
cat > /etc/nut/upsd.users <<EOF
[${NUT_MONITOR_USER}]
password = ${NUT_MONITOR_PASSWORD}
upsmon primary
EOF
chmod 0640 /etc/nut/upsd.users
if getent group nut >/dev/null 2>&1; then
chown root:nut /etc/nut/upsd.users
else
chown root:root /etc/nut/upsd.users
fi
cat > /etc/nut/upsmon.conf <<EOF
RUN_AS_USER nut
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
MINSUPPLIES 1
SHUTDOWNCMD "/sbin/shutdown -h +0"
POLLFREQ 5
POLLFREQALERT 5
HOSTSYNC 15
DEADTIME 15
POWERDOWNFLAG /etc/killpower
EOF
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
EOF
udevadm control --reload-rules || true
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
}
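configure_nut provisions the standalone NUT stack that the daemon polls. As a hedged sketch only (the daemon's real provider is not part of this diff), standard NUT status tokens such as OL, OB, and LB map naturally onto the sample fields the tests exercise; assumed import: "strings".

// Sketch: derive the test-visible sample fields from NUT's ups.status string
// and battery.runtime value. Field names mirror the tests above.
type nutSampleSketch struct {
	OnBattery      bool
	LowBattery     bool
	RuntimeSeconds int
	RawStatus      string
}

func nutSampleFromStatus(status string, runtimeSeconds int) nutSampleSketch {
	s := nutSampleSketch{RuntimeSeconds: runtimeSeconds, RawStatus: status}
	for _, flag := range strings.Fields(status) { // e.g. "OB LB" -> ["OB", "LB"]
		switch flag {
		case "OB":
			s.OnBattery = true
		case "LB":
			s.LowBattery = true
		}
	}
	return s
}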

View File

@ -1,98 +0,0 @@
# Legacy Hecate migration helpers for the Ananke installer.
legacy_path_rewrite() {
local src="$1"
local dst="$2"
sed \
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#/etc/hecate#/etc/ananke#g' \
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
-e 's/hecate.yaml/ananke.yaml/g' \
-e 's/hecate.lock/ananke.lock/g' \
-e 's/hecate/ananke/g' \
-e 's/Hecate/Ananke/g' \
-e 's#hecate\.lock#ananke.lock#g' \
"${src}" > "${dst}"
}
migrate_legacy_hecate_install() {
local legacy_conf_dir="/etc/hecate"
local legacy_state_dir="/var/lib/hecate"
local legacy_systemd_dir="/etc/systemd/system"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
chmod 0640 "${CONF_DIR}/ananke.yaml"
fi
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
fi
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
fi
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
fi
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
fi
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
fi
if [[ -d "${legacy_systemd_dir}" ]]; then
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
fi
fi
}
retire_legacy_hecate_install() {
local ts backup_dir
ts="$(date +%Y%m%d%H%M%S)"
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
systemctl stop hecate-update.service >/dev/null 2>&1 || true
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
install -d -m 0750 "${backup_dir}"
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
echo "[install] backed up legacy hecate assets to ${backup_dir}"
fi
rm -f \
/etc/systemd/system/hecate.service \
/etc/systemd/system/hecate-bootstrap.service \
/etc/systemd/system/hecate-update.service \
/etc/systemd/system/hecate-update.timer
rm -f /usr/local/bin/hecate
rm -rf /usr/local/lib/hecate
rm -rf /opt/hecate
rm -rf /etc/hecate
rm -rf /var/lib/hecate
}

View File

@ -41,10 +41,829 @@ while [[ $# -gt 0 ]]; do
esac
done
source "${REPO_DIR}/scripts/install-config-migration.sh"
source "${REPO_DIR}/scripts/install-host-bootstrap.sh"
source "${REPO_DIR}/scripts/install-legacy-migration.sh"
source "${REPO_DIR}/scripts/install-artifacts.sh"
resolve_nut_ups_name() {
if [[ -n "${NUT_UPS_NAME}" ]]; then
return 0
fi
if [[ -f "${CONF_DIR}/ananke.yaml" ]]; then
local target=""
target="$(grep -Eo 'target:[[:space:]]*[A-Za-z0-9._-]+@localhost' "${CONF_DIR}/ananke.yaml" | head -n 1 | awk '{print $2}')"
if [[ -n "${target}" ]]; then
NUT_UPS_NAME="${target%@localhost}"
echo "[install] inferred NUT UPS name from config: ${NUT_UPS_NAME}"
return 0
fi
fi
NUT_UPS_NAME="pyrphoros"
echo "[install] defaulting NUT UPS name to ${NUT_UPS_NAME}"
}
read_ananke_role() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
echo "coordinator"
return 0
fi
local role
role="$(awk '/^[[:space:]]*role:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -z "${role}" ]]; then
role="coordinator"
fi
echo "${role}"
}
migration_yaml_lookup() {
local key="$1"
awk -F': *' -v k="${key}" '$1 == k {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
first_control_plane_name() {
awk '
/^control_planes:[[:space:]]*$/ {in_list=1; next}
in_list && /^[[:space:]]*-[[:space:]]*/ {gsub(/^[[:space:]]*-[[:space:]]*/, "", $0); print $0; exit}
in_list && /^[^[:space:]]/ {in_list=0}
' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
lookup_node_host() {
local node="$1"
awk -F': *' -v n="${node}" '$1 == " " n {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true
}
ensure_ananke_kubeconfig() {
local kubeconfig_path
kubeconfig_path="$(migration_yaml_lookup "kubeconfig")"
if [[ -z "${kubeconfig_path}" ]]; then
kubeconfig_path="/etc/ananke/kubeconfig"
fi
install -d -m 0750 "$(dirname "${kubeconfig_path}")"
if [[ -s "${kubeconfig_path}" ]] && KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
if [[ -r /etc/rancher/k3s/k3s.yaml ]]; then
install -m 0600 /etc/rancher/k3s/k3s.yaml "${kubeconfig_path}"
echo "[install] refreshed kubeconfig from local /etc/rancher/k3s/k3s.yaml"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
fi
local cp_name cp_host ssh_user ssh_port ssh_cfg ssh_key
cp_name="$(first_control_plane_name)"
if [[ -z "${cp_name}" ]]; then
echo "[install] warning: cannot infer control plane name; kubeconfig bootstrap skipped"
return 0
fi
cp_host="$(lookup_node_host "${cp_name}")"
if [[ -z "${cp_host}" ]]; then
cp_host="${cp_name}"
fi
ssh_user="$(migration_yaml_lookup "ssh_user")"
ssh_port="$(migration_yaml_lookup "ssh_port")"
ssh_cfg="$(migration_yaml_lookup "ssh_config_file")"
ssh_key="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${ssh_port}" ]]; then
ssh_port="2277"
fi
local target
target="${cp_host}"
if [[ -n "${ssh_user}" ]]; then
target="${ssh_user}@${cp_host}"
fi
local ssh_args=(
-o BatchMode=yes
-o ConnectTimeout=8
-o StrictHostKeyChecking=accept-new
)
if [[ -n "${ssh_cfg}" && -f "${ssh_cfg}" ]]; then
ssh_args+=(-F "${ssh_cfg}")
fi
if [[ -n "${ssh_key}" && -f "${ssh_key}" ]]; then
ssh_args+=(-i "${ssh_key}")
fi
if [[ -n "${ssh_port}" ]]; then
ssh_args+=(-p "${ssh_port}")
fi
local remote_cfg
if remote_cfg="$(ssh "${ssh_args[@]}" "${target}" "sudo cat /etc/rancher/k3s/k3s.yaml" 2>/dev/null)"; then
printf '%s\n' "${remote_cfg}" > "${kubeconfig_path}"
sed -Ei "s#server:[[:space:]]*https://127\\.0\\.0\\.1:6443#server: https://${cp_host}:6443#g" "${kubeconfig_path}" || true
chmod 0600 "${kubeconfig_path}"
echo "[install] bootstrapped kubeconfig from control plane ${cp_name} (${cp_host})"
if KUBECONFIG="${kubeconfig_path}" kubectl version --request-timeout=5s >/dev/null 2>&1; then
return 0
fi
else
echo "[install] warning: failed to fetch kubeconfig from ${cp_name} (${cp_host})"
fi
echo "[install] warning: kubeconfig at ${kubeconfig_path} is still not validated; local startup fallback may fail"
}
ensure_ananke_ssh_identity() {
local key_path key_dir key_user key_comment
key_path="$(migration_yaml_lookup "ssh_identity_file")"
if [[ -z "${key_path}" ]]; then
key_path="/home/atlas/.ssh/id_ed25519"
fi
key_dir="$(dirname "${key_path}")"
key_comment="ananke-$(hostname)-forward"
key_user="root"
if [[ "${key_path}" == /home/*/* ]]; then
key_user="${key_path#/home/}"
key_user="${key_user%%/*}"
fi
if ! id "${key_user}" >/dev/null 2>&1; then
echo "[install] warning: ssh identity owner ${key_user} does not exist; skipping key bootstrap for ${key_path}"
return 0
fi
install -d -m 0700 -o "${key_user}" -g "${key_user}" "${key_dir}"
if [[ ! -s "${key_path}" ]]; then
echo "[install] generating missing SSH identity at ${key_path}"
if [[ "${key_user}" == "root" ]]; then
ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
else
runuser -u "${key_user}" -- ssh-keygen -q -t ed25519 -N '' -C "${key_comment}" -f "${key_path}"
fi
fi
chown "${key_user}:${key_user}" "${key_path}" "${key_path}.pub" 2>/dev/null || true
chmod 0600 "${key_path}" || true
chmod 0644 "${key_path}.pub" || true
}
migrate_ananke_config() {
if [[ ! -f "${CONF_DIR}/ananke.yaml" ]]; then
return 0
fi
local changed=0
local role_hint
role_hint="$(read_ananke_role)"
if grep -Eq 'default_budget_seconds:[[:space:]]*300' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(default_budget_seconds:[[:space:]]*)300/\11380/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated default_budget_seconds 300 -> 1380 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq 'runtime_safety_factor:[[:space:]]*1\.10' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/(runtime_safety_factor:[[:space:]]*)1\.10/\11.25/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated runtime_safety_factor 1.10 -> 1.25 in ${CONF_DIR}/ananke.yaml"
changed=1
fi
if grep -Eq '^ssh_node_users:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ titan-24:[[:space:]]*tethys[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei 's/^ titan-24:[[:space:]]*tethys[[:space:]]*$/ titan-24: atlas/' "${CONF_DIR}/ananke.yaml"
echo "[install] migrated ssh_node_users titan-24 override to atlas"
changed=1
fi
if grep -Eq '^ command_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ startup_guard_max_age_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ command_timeout_seconds:[[:space:]]*[0-9]+/a\ startup_guard_max_age_seconds: 900' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.startup_guard_max_age_seconds=900"
changed=1
fi
if grep -Eq '^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei \
-e '/^[[:space:]]*poweroff_enabled:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*poweroff_delay_seconds:[[:space:]]*[0-9]+/d' \
-e '/^[[:space:]]*poweroff_local_host:[[:space:]]*(true|false)/d' \
-e '/^[[:space:]]*extra_poweroff_hosts:[[:space:]]*(\[\])?[[:space:]]*$/d' \
"${CONF_DIR}/ananke.yaml"
echo "[install] removed deprecated host-poweroff shutdown config keys"
changed=1
fi
if grep -Eq '^ minimum_battery_percent:[[:space:]]*[0-9.]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_node_inventory_reachability:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ minimum_battery_percent:[[:space:]]*[0-9.]+/a\ require_node_inventory_reachability: true\n node_inventory_reachability_wait_seconds: 300\n node_inventory_reachability_poll_seconds: 5' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup node inventory reachability gate defaults"
changed=1
fi
if grep -Eq '^state:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ reports_dir:[[:space:]]*/var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ dir:[[:space:]]*\/var\/lib\/ananke$/a\ reports_dir: /var/lib/ananke/reports' "${CONF_DIR}/ananke.yaml"
echo "[install] added state.reports_dir default"
changed=1
fi
if ! grep -Eq '^ peer_hosts:' "${CONF_DIR}/ananke.yaml"; then
if [[ "${role_hint}" == "peer" ]] && grep -Eq '^ forward_shutdown_host:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml"; then
local peer_host
peer_host="$(awk -F': *' '/^ forward_shutdown_host:[[:space:]]*/ {print $2; exit}' "${CONF_DIR}/ananke.yaml" 2>/dev/null || true)"
if [[ -n "${peer_host}" ]]; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - '"${peer_host}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts from forward_shutdown_host (${peer_host})"
changed=1
fi
elif [[ "${role_hint}" == "coordinator" ]] && grep -Eq '^ titan-24:[[:space:]]*[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts:\n - titan-24' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts default (titan-24) for coordinator role"
changed=1
else
sed -Ei '/^ forward_shutdown_config:[[:space:]]*.*$/a\ peer_hosts: []' "${CONF_DIR}/ananke.yaml"
echo "[install] added coordination.peer_hosts empty default"
changed=1
fi
fi
local default_restore_cp
default_restore_cp="$(first_control_plane_name)"
if [[ -z "${default_restore_cp}" ]]; then
default_restore_cp="titan-0a"
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ auto_etcd_restore_on_api_failure:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true\n auto_etcd_restore_on_api_failure: true\n etcd_restore_control_plane: '"${default_restore_cp}"'' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.auto_etcd_restore_on_api_failure + startup.etcd_restore_control_plane defaults"
changed=1
fi
if grep -Eq '^ api_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_time_sync:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ api_poll_seconds:[[:space:]]*[0-9]+/a\ require_time_sync: true\n time_sync_wait_seconds: 240\n time_sync_poll_seconds: 5\n reconcile_access_on_boot: true' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync + access reconciliation defaults"
changed=1
fi
if grep -Eq '^ time_sync_poll_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ time_sync_mode:[[:space:]]*(strict|quorum)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ time_sync_poll_seconds:[[:space:]]*[0-9]+/a\ time_sync_mode: quorum\n time_sync_quorum: 2' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup time sync quorum defaults"
changed=1
fi
if grep -Eq '^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_storage_ready:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ etcd_restore_control_plane:[[:space:]]*[A-Za-z0-9._-]+/a\ require_storage_ready: true\n storage_ready_wait_seconds: 420\n storage_ready_poll_seconds: 5\n storage_min_ready_nodes: 2\n storage_critical_pvcs:\n - vault/data-vault-0\n - postgres/postgres-data-postgres-0\n - gitea/gitea-data\n - sso/keycloak-data' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup storage readiness defaults"
changed=1
fi
if grep -Eq '^ storage_critical_pvcs:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ require_post_start_probes:[[:space:]]*(true|false)' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - sso\/keycloak-data$/a\ require_post_start_probes: true\n post_start_probe_wait_seconds: 240\n post_start_probe_poll_seconds: 5\n post_start_probes:\n - https://scm.bstein.dev/api/healthz\n - https://metrics.bstein.dev/api/health\n require_service_checklist: true\n service_checklist_wait_seconds: 420\n service_checklist_poll_seconds: 5\n service_checklist_stability_seconds: 120\n service_checklist:\n - name: gitea-api\n url: https://scm.bstein.dev/api/healthz\n accepted_statuses: [200]\n body_contains: pass\n timeout_seconds: 12\n - name: grafana-api\n url: https://metrics.bstein.dev/api/health\n accepted_statuses: [200]\n body_contains: '\''\"database\":\"ok\"'\''\n timeout_seconds: 12\n vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup post-start probe + vault key fallback defaults"
changed=1
fi
if grep -Eq '^ - https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/sso\.bstein\.dev\/realms\/atlas\/\.well-known\/openid-configuration$/d' "${CONF_DIR}/ananke.yaml"
echo "[install] removed sso OIDC probe from startup.post_start_probes (returns 404 in current deployment)"
changed=1
fi
if ! grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^startup:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" && grep -Eq '^ post_start_probes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ - https:\/\/metrics\.bstein\.dev\/api\/health$/a\ vault_unseal_key_file: /var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup.vault_unseal_key_file default"
changed=1
fi
fi
if ! grep -Eq '^ vault_unseal_breakglass_timeout_seconds:[[:space:]]*[0-9]+' "${CONF_DIR}/ananke.yaml"; then
if grep -Eq '^ vault_unseal_key_file:[[:space:]]*/var/lib/ananke/vault-unseal.key' "${CONF_DIR}/ananke.yaml"; then
sed -Ei '/^ vault_unseal_key_file:[[:space:]]*\/var\/lib\/ananke\/vault-unseal.key$/a\ vault_unseal_breakglass_command: ""\n vault_unseal_breakglass_timeout_seconds: 15' "${CONF_DIR}/ananke.yaml"
echo "[install] added startup break-glass fallback defaults"
changed=1
fi
fi
local role
role="$(read_ananke_role)"
local inventory_block
local managed_block
local workers_block
workers_block='workers:
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
if [[ "${role}" == "coordinator" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
elif [[ "${role}" == "peer" ]]; then
inventory_block='ssh_node_hosts:
titan-db: 192.168.22.10
titan-0a: 192.168.22.11
titan-0b: 192.168.22.12
titan-0c: 192.168.22.13
titan-04: 192.168.22.30
titan-05: 192.168.22.31
titan-06: 192.168.22.32
titan-07: 192.168.22.33
titan-08: 192.168.22.34
titan-09: 192.168.22.35
titan-10: 192.168.22.36
titan-11: 192.168.22.37
titan-12: 192.168.22.40
titan-13: 192.168.22.41
titan-14: 192.168.22.42
titan-15: 192.168.22.43
titan-17: 192.168.22.45
titan-18: 192.168.22.46
titan-19: 192.168.22.47
titan-20: 192.168.22.20
titan-21: 192.168.22.21
titan-22: 192.168.22.22
titan-24: 192.168.22.26'
managed_block='ssh_managed_nodes:
- titan-db
- titan-0a
- titan-0b
- titan-0c
- titan-04
- titan-05
- titan-06
- titan-07
- titan-08
- titan-09
- titan-10
- titan-11
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
- titan-20
- titan-21
- titan-22
- titan-24'
fi
if [[ -n "${inventory_block}" ]]; then
if grep -Eq '^ssh_node_hosts:[[:space:]]*\{\}[[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_node_hosts:\s*\{\}\n#'"${inventory_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_node_hosts inventory for role=${role}"
changed=1
fi
fi
if grep -Eq '^workers:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#workers:\s*\[\]\n#'"${workers_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated workers inventory for startup/shutdown orchestration"
changed=1
fi
if [[ -n "${managed_block}" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*\[\][[:space:]]*$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\s*\[\]\n#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] hydrated ssh_managed_nodes inventory for role=${role}"
changed=1
fi
if ! grep -Eq '^ - titan-04$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - titan-21$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n(?: - [^\n]*\n)*#'"${managed_block}"'\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed ssh_managed_nodes coverage for role=${role}"
changed=1
fi
fi
if [[ "${role}" == "peer" ]]; then
if grep -Eq '^ssh_managed_nodes:[[:space:]]*$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-db$' "${CONF_DIR}/ananke.yaml" \
&& grep -Eq '^ - titan-24$' "${CONF_DIR}/ananke.yaml" \
&& ! grep -Eq '^ - titan-0a$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#ssh_managed_nodes:\n - titan-db\n - titan-24\n#ssh_managed_nodes:\n - titan-db\n - titan-0a\n - titan-0b\n - titan-0c\n - titan-04\n - titan-05\n - titan-06\n - titan-07\n - titan-08\n - titan-09\n - titan-10\n - titan-11\n - titan-12\n - titan-13\n - titan-14\n - titan-15\n - titan-17\n - titan-18\n - titan-19\n - titan-20\n - titan-21\n - titan-22\n - titan-24\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] expanded peer ssh_managed_nodes for bootstrap fallback coverage"
changed=1
fi
if ! grep -Eq '^ - services/keycloak$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - infrastructure/cert-manager$' "${CONF_DIR}/ananke.yaml" || ! grep -Eq '^ - services/oauth2-proxy$' "${CONF_DIR}/ananke.yaml"; then
perl -0pi -e 's#local_bootstrap_paths:\n(?: - [^\n]*\n)*#local_bootstrap_paths:\n - infrastructure/core\n - clusters/atlas/flux-system\n - infrastructure/sources/helm\n - infrastructure/metallb\n - infrastructure/traefik\n - infrastructure/cert-manager\n - infrastructure/vault-csi\n - infrastructure/vault-injector\n - services/vault\n - infrastructure/postgres\n - services/gitea\n - services/keycloak\n - services/oauth2-proxy\n#s' "${CONF_DIR}/ananke.yaml"
echo "[install] refreshed peer local_bootstrap_paths for full fallback bootstrap parity"
changed=1
fi
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${CONF_DIR}/ananke.yaml" || true
fi
}
sanitize_migrated_ananke_config() {
local cfg="${CONF_DIR}/ananke.yaml"
[[ -f "${cfg}" ]] || return 0
local tmp changed=0
tmp="$(mktemp)"
# Legacy migration bug guard:
# If root-level "- node" entries were accidentally appended after ssh_managed_nodes,
# drop those orphan entries until the next top-level key.
awk '
BEGIN {in_managed=0}
/^ssh_managed_nodes:[[:space:]]*$/ {in_managed=1; print; next}
{
if (in_managed) {
if ($0 ~ /^ - /) {print; next}
if ($0 ~ /^- /) {next}
if ($0 ~ /^[A-Za-z0-9_]+:[[:space:]]*/) {in_managed=0}
}
print
}
' "${cfg}" > "${tmp}"
if ! cmp -s "${cfg}" "${tmp}"; then
mv "${tmp}" "${cfg}"
changed=1
echo "[install] sanitized malformed ssh_managed_nodes block in ${cfg}"
else
rm -f "${tmp}"
fi
if grep -Eq '^[[:space:]]*forward_shutdown_config:[[:space:]]*/etc/ananke/hecate.yaml[[:space:]]*$' "${cfg}"; then
sed -Ei 's#(^[[:space:]]*forward_shutdown_config:[[:space:]]*)/etc/ananke/hecate.yaml#\1/etc/ananke/ananke.yaml#' "${cfg}"
changed=1
echo "[install] migrated coordination.forward_shutdown_config to /etc/ananke/ananke.yaml"
fi
if [[ "${changed}" -eq 1 ]]; then
chmod 0640 "${cfg}" || true
fi
}
ensure_apt_packages() {
local missing=()
for pkg in "$@"; do
if ! dpkg -s "${pkg}" >/dev/null 2>&1; then
missing+=("${pkg}")
fi
done
if [[ ${#missing[@]} -eq 0 ]]; then
return 0
fi
echo "[install] apt install: ${missing[*]}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y "${missing[@]}"
}
install_kubectl_if_missing() {
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
ensure_apt_packages kubernetes-client || true
if command -v kubectl >/dev/null 2>&1; then
return 0
fi
echo "[install] installing kubectl via upstream binary"
local arch
arch="$(uname -m)"
case "${arch}" in
x86_64) arch="amd64" ;;
aarch64|arm64) arch="arm64" ;;
*) echo "Unsupported arch for kubectl install: ${arch}" >&2; return 1 ;;
esac
local version
version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${version}/bin/linux/${arch}/kubectl"
chmod 0755 /usr/local/bin/kubectl
}
ensure_dependencies() {
if [[ "${INSTALL_DEPS}" -eq 0 ]]; then
echo "[install] skipping dependency installation"
return 0
fi
if ! command -v apt-get >/dev/null 2>&1; then
echo "This installer currently supports apt-based hosts only." >&2
exit 1
fi
ensure_apt_packages ca-certificates curl git openssh-client jq nut-client nut-server nut-monitor golang-go
install_kubectl_if_missing
}
legacy_path_rewrite() {
local src="$1"
local dst="$2"
sed \
-e 's#/etc/hecate/hecate.yaml#/etc/ananke/ananke.yaml#g' \
-e 's#/etc/hecate/kubeconfig#/etc/ananke/kubeconfig#g' \
-e 's#/var/lib/hecate/vault-unseal.key#/var/lib/ananke/vault-unseal.key#g' \
-e 's#/var/lib/hecate/hecate.lock#/var/lib/ananke/ananke.lock#g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#/etc/hecate#/etc/ananke#g' \
-e 's#/var/lib/hecate#/var/lib/ananke#g' \
-e 's#/usr/local/bin/hecate#/usr/local/bin/ananke#g' \
-e 's#/usr/local/lib/hecate#/usr/local/lib/ananke#g' \
-e 's/hecate.yaml/ananke.yaml/g' \
-e 's/hecate.lock/ananke.lock/g' \
-e 's/hecate/ananke/g' \
-e 's/Hecate/Ananke/g' \
-e 's#hecate\.lock#ananke.lock#g' \
"${src}" > "${dst}"
}
migrate_legacy_hecate_install() {
local legacy_conf_dir="/etc/hecate"
local legacy_state_dir="/var/lib/hecate"
local legacy_systemd_dir="/etc/systemd/system"
install -d -m 0750 "${CONF_DIR}"
install -d -m 0750 "${STATE_DIR}"
if [[ ! -f "${CONF_DIR}/ananke.yaml" && -f "${legacy_conf_dir}/hecate.yaml" ]]; then
echo "[install] migrating legacy config ${legacy_conf_dir}/hecate.yaml -> ${CONF_DIR}/ananke.yaml"
legacy_path_rewrite "${legacy_conf_dir}/hecate.yaml" "${CONF_DIR}/ananke.yaml"
chmod 0640 "${CONF_DIR}/ananke.yaml"
fi
if [[ ! -f "${CONF_DIR}/kubeconfig" && -f "${legacy_conf_dir}/kubeconfig" ]]; then
echo "[install] migrating legacy kubeconfig ${legacy_conf_dir}/kubeconfig -> ${CONF_DIR}/kubeconfig"
install -m 0600 "${legacy_conf_dir}/kubeconfig" "${CONF_DIR}/kubeconfig"
fi
if [[ ! -f "${STATE_DIR}/vault-unseal.key" && -f "${legacy_state_dir}/vault-unseal.key" ]]; then
echo "[install] migrating legacy vault key ${legacy_state_dir}/vault-unseal.key -> ${STATE_DIR}/vault-unseal.key"
install -m 0600 "${legacy_state_dir}/vault-unseal.key" "${STATE_DIR}/vault-unseal.key"
fi
if [[ ! -f "${STATE_DIR}/runs.json" && -f "${legacy_state_dir}/runs.json" ]]; then
echo "[install] migrating legacy run history ${legacy_state_dir}/runs.json -> ${STATE_DIR}/runs.json"
install -m 0640 "${legacy_state_dir}/runs.json" "${STATE_DIR}/runs.json"
fi
if [[ ! -f "${STATE_DIR}/intent.json" && -f "${legacy_state_dir}/intent.json" ]]; then
echo "[install] migrating legacy intent state ${legacy_state_dir}/intent.json -> ${STATE_DIR}/intent.json"
install -m 0640 "${legacy_state_dir}/intent.json" "${STATE_DIR}/intent.json"
fi
if [[ ! -f "${STATE_DIR}/ananke.lock" && -f "${legacy_state_dir}/hecate.lock" ]]; then
echo "[install] migrating legacy lock ${legacy_state_dir}/hecate.lock -> ${STATE_DIR}/ananke.lock"
install -m 0640 "${legacy_state_dir}/hecate.lock" "${STATE_DIR}/ananke.lock"
fi
if [[ -d "${legacy_systemd_dir}" ]]; then
if ls "${legacy_systemd_dir}"/hecate*.service >/dev/null 2>&1 || ls "${legacy_systemd_dir}"/hecate*.timer >/dev/null 2>&1; then
echo "[install] detected legacy hecate systemd unit files; will retire after ananke install"
fi
fi
}
retire_legacy_hecate_install() {
local ts backup_dir
ts="$(date +%Y%m%d%H%M%S)"
backup_dir="/var/backups/ananke-legacy-hecate-${ts}"
systemctl disable --now hecate.service hecate-bootstrap.service hecate-update.timer >/dev/null 2>&1 || true
systemctl stop hecate-update.service >/dev/null 2>&1 || true
if [[ -d /etc/hecate || -d /var/lib/hecate || -d /usr/local/lib/hecate || -d /opt/hecate ]]; then
install -d -m 0750 "${backup_dir}"
[[ -d /etc/hecate ]] && cp -a /etc/hecate "${backup_dir}/" || true
[[ -d /var/lib/hecate ]] && cp -a /var/lib/hecate "${backup_dir}/" || true
[[ -d /usr/local/lib/hecate ]] && cp -a /usr/local/lib/hecate "${backup_dir}/" || true
[[ -d /opt/hecate ]] && cp -a /opt/hecate "${backup_dir}/" || true
[[ -f /usr/local/bin/hecate ]] && install -m 0755 /usr/local/bin/hecate "${backup_dir}/hecate.bin" || true
echo "[install] backed up legacy hecate assets to ${backup_dir}"
fi
rm -f \
/etc/systemd/system/hecate.service \
/etc/systemd/system/hecate-bootstrap.service \
/etc/systemd/system/hecate-update.service \
/etc/systemd/system/hecate-update.timer
rm -f /usr/local/bin/hecate
rm -rf /usr/local/lib/hecate
rm -rf /opt/hecate
rm -rf /etc/hecate
rm -rf /var/lib/hecate
}
resolve_build_target() {
if [[ -d "${REPO_DIR}/cmd/ananke" ]]; then
echo "./cmd/ananke"
return 0
fi
return 1
}
install_config_template() {
local template="$1"
local dest="$2"
local src legacy
local -a modern_candidates=()
local -a legacy_candidates=()
case "${template}" in
coordinator)
modern_candidates=("configs/ananke.coordinator.yaml" "configs/ananke.titan-db.yaml")
legacy_candidates=("configs/hecate.titan-db.yaml")
;;
peer)
modern_candidates=("configs/ananke.peer.yaml" "configs/ananke.tethys.yaml")
legacy_candidates=("configs/hecate.tethys.yaml")
;;
example)
modern_candidates=("configs/ananke.example.yaml")
legacy_candidates=("configs/hecate.example.yaml")
;;
*)
echo "[install] unknown config template key: ${template}" >&2
return 1
;;
esac
for src in "${modern_candidates[@]}"; do
if [[ -f "${src}" ]]; then
install -m 0640 "${src}" "${dest}"
return 0
fi
done
for legacy in "${legacy_candidates[@]}"; do
if [[ -f "${legacy}" ]]; then
src="$(mktemp)"
legacy_path_rewrite "${legacy}" "${src}"
install -m 0640 "${src}" "${dest}"
rm -f "${src}"
return 0
fi
done
echo "[install] missing config template sources for '${template}'. modern=[${modern_candidates[*]}] legacy=[${legacy_candidates[*]}]" >&2
return 1
}
install_systemd_units() {
local source_map
local tmp
while IFS='|' read -r target_name modern_name legacy_name; do
local modern_src="deploy/systemd/${modern_name}"
local legacy_src="deploy/systemd/${legacy_name}"
local target="${SYSTEMD_DIR}/${target_name}"
if [[ -f "${modern_src}" ]]; then
install -m 0644 "${modern_src}" "${target}"
continue
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
install -m 0644 "${tmp}" "${target}"
rm -f "${tmp}"
continue
fi
echo "[install] missing both modern and legacy systemd unit sources for ${target_name}" >&2
return 1
done <<'EOF_UNITS'
ananke.service|ananke.service|hecate.service
ananke-bootstrap.service|ananke-bootstrap.service|hecate-bootstrap.service
ananke-update.service|ananke-update.service|hecate-update.service
ananke-update.timer|ananke-update.timer|hecate-update.timer
EOF_UNITS
}
install_self_update_script() {
local modern_src="scripts/ananke-self-update.sh"
local legacy_src="scripts/hecate-self-update.sh"
local target="${LIB_DIR}/ananke-self-update.sh"
local tmp
if [[ -f "${modern_src}" ]]; then
install -m 0755 "${modern_src}" "${target}"
return 0
fi
if [[ -f "${legacy_src}" ]]; then
tmp="$(mktemp)"
legacy_path_rewrite "${legacy_src}" "${tmp}"
sed -Ei \
-e 's/HECATE_/ANANKE_/g' \
-e 's/hecate-self-update/ananke-self-update/g' \
-e 's#/opt/hecate#/opt/ananke#g' \
-e 's#bstein/hecate\.git#bstein/ananke.git#g' \
"${tmp}"
install -m 0755 "${tmp}" "${target}"
rm -f "${tmp}"
return 0
fi
echo "[install] missing both modern and legacy self-update scripts." >&2
return 1
}
configure_nut() {
if [[ "${MANAGE_NUT}" != "1" ]]; then
echo "[install] skipping NUT configuration (ANANKE_MANAGE_NUT=${MANAGE_NUT})"
return 0
fi
echo "[install] configuring NUT + udev for UPS ${NUT_UPS_NAME} (${NUT_VENDOR_ID}:${NUT_PRODUCT_ID})"
install -d -m 0755 /etc/nut /etc/udev/rules.d
cat > /etc/nut/nut.conf <<EOF
MODE=standalone
EOF
cat > /etc/nut/ups.conf <<EOF
[${NUT_UPS_NAME}]
driver = usbhid-ups
port = auto
vendorid = ${NUT_VENDOR_ID}
productid = ${NUT_PRODUCT_ID}
pollinterval = 5
EOF
cat > /etc/nut/upsd.users <<EOF
[${NUT_MONITOR_USER}]
password = ${NUT_MONITOR_PASSWORD}
upsmon primary
EOF
chmod 0640 /etc/nut/upsd.users
if getent group nut >/dev/null 2>&1; then
chown root:nut /etc/nut/upsd.users
else
chown root:root /etc/nut/upsd.users
fi
cat > /etc/nut/upsmon.conf <<EOF
RUN_AS_USER nut
MONITOR ${NUT_UPS_NAME}@localhost 1 ${NUT_MONITOR_USER} ${NUT_MONITOR_PASSWORD} primary
MINSUPPLIES 1
SHUTDOWNCMD "/sbin/shutdown -h +0"
POLLFREQ 5
POLLFREQALERT 5
HOSTSYNC 15
DEADTIME 15
POWERDOWNFLAG /etc/killpower
EOF
cat > /etc/udev/rules.d/99-ananke-ups.rules <<EOF
# Managed by ananke install.sh: ensure UPS USB HID devices are readable by NUT
ACTION=="add|change", SUBSYSTEM=="usb", ATTR{idVendor}=="${NUT_VENDOR_ID}", ATTR{idProduct}=="${NUT_PRODUCT_ID}", MODE:="0660", GROUP:="nut"
EOF
udevadm control --reload-rules || true
udevadm trigger --subsystem-match=usb --attr-match=idVendor="${NUT_VENDOR_ID}" --attr-match=idProduct="${NUT_PRODUCT_ID}" || true
systemctl enable nut-driver-enumerator.service nut-server.service nut-monitor.service >/dev/null 2>&1 || true
systemctl restart nut-driver-enumerator.service >/dev/null 2>&1 || true
systemctl restart "nut-driver@${NUT_UPS_NAME}.service" >/dev/null 2>&1 || true
systemctl restart nut-server.service nut-monitor.service >/dev/null 2>&1 || true
}
ensure_dependencies
migrate_legacy_hecate_install

View File

@ -6,28 +6,9 @@ cd "${REPO_DIR}"
export PATH="$(go env GOPATH)/bin:${PATH}"
STATICCHECK_VERSION="${ANANKE_STATICCHECK_VERSION:-2025.1.1}"
run_with_retry() {
local attempts="$1"
shift
local try=1
local delay=3
local rc=0
while true; do
"$@" && return 0
rc=$?
if [[ "${try}" -ge "${attempts}" ]]; then
return "${rc}"
fi
echo "[lint] retry ${try}/${attempts} after rc=${rc}: $*" >&2
sleep "${delay}"
delay=$((delay * 2))
try=$((try + 1))
done
}
if ! command -v staticcheck >/dev/null 2>&1 || ! staticcheck -version 2>/dev/null | grep -q "${STATICCHECK_VERSION}"; then
echo "[lint] installing staticcheck ${STATICCHECK_VERSION}"
run_with_retry 4 go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
go install "honnef.co/go/tools/cmd/staticcheck@${STATICCHECK_VERSION}"
fi
echo "[lint] go vet"

View File

@ -77,17 +77,6 @@ def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str,
return 0.0
def _series_exists(pushgateway_url: str, metric: str, labels: dict[str, str], timeout_seconds: float) -> bool:
"""Return whether Pushgateway already has a series for this build."""
text = _read_http(f"{pushgateway_url.rstrip('/')}/metrics", timeout_seconds)
for line in text.splitlines():
if not line.startswith(metric + "{"):
continue
if all(f'{key}="{value}"' in line for key, value in labels.items()):
return True
return False
def _build_payload(
suite: str,
trigger: str,
@ -100,25 +89,9 @@ def _build_payload(
tests_skipped: int,
test_cases: list[tuple[str, str]],
coverage_percent: float,
source_files_total: int,
source_lines_over_500: int,
branch: str,
build_number: str,
jenkins_job: str,
checks: dict[str, str],
) -> str:
build_labels = {
"suite": suite,
"branch": branch,
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job,
}
test_case_base_labels = {
"suite": suite,
"branch": branch,
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job,
}
lines = [
"# TYPE platform_quality_gate_runs_total counter",
f'platform_quality_gate_runs_total{{suite="{suite}",status="ok"}} {ok_count}',
@ -132,30 +105,21 @@ def _build_payload(
f'ananke_quality_gate_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
"# TYPE platform_quality_gate_workspace_line_coverage_percent gauge",
f'platform_quality_gate_workspace_line_coverage_percent{{suite="{suite}"}} {coverage_percent:.3f}',
"# TYPE platform_quality_gate_source_files_total gauge",
f'platform_quality_gate_source_files_total{{suite="{suite}"}} {source_files_total}',
"# TYPE platform_quality_gate_source_lines_over_500_total gauge",
f'platform_quality_gate_source_lines_over_500_total{{suite="{suite}"}} {source_lines_over_500}',
"# TYPE platform_quality_gate_build_info gauge",
f"platform_quality_gate_build_info{_label_str(build_labels)} 1",
"# TYPE platform_quality_gate_test_case_result gauge",
"# TYPE ananke_quality_gate_checks_total gauge",
"# TYPE ananke_quality_gate_publish_info gauge",
f'ananke_quality_gate_publish_info{_label_str({"suite": suite, "trigger": trigger})} 1',
]
lines.extend(
f'platform_quality_gate_test_case_result{{suite="{suite}",test="{_escape_label(test_name)}",status="{_escape_label(test_status)}"}} 1'
for test_name, test_status in test_cases
)
lines.extend(
f'ananke_quality_gate_checks_total{{suite="{suite}",check="{check_name}",result="{check_status}"}} 1'
for check_name, check_status in checks.items()
)
lines.append("# TYPE platform_quality_gate_test_case_result gauge")
if test_cases:
lines.extend(
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': test_name, 'status': test_status})} 1"
for test_name, test_status in test_cases
)
else:
lines.append(
f"platform_quality_gate_test_case_result{_label_str({**test_case_base_labels, 'test': '__no_test_cases__', 'status': 'skipped'})} 1"
)
return "\n".join(lines) + "\n"
@ -172,7 +136,8 @@ def _read_coverage_percent(path: str) -> float:
return 0.0
def _iter_source_files(repo_root: Path):
def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
count = 0
for rel_root in SOURCE_SCAN_ROOTS:
base = repo_root / rel_root
if not base.exists():
@ -182,37 +147,12 @@ def _iter_source_files(repo_root: Path):
continue
if path.suffix not in SOURCE_EXTENSIONS:
continue
if path.name.endswith("_test.go") or path.name.endswith(".test.py"):
continue
yield path
def _count_source_files(repo_root: Path) -> int:
return sum(1 for _ in _iter_source_files(repo_root))
def _count_source_files_over_limit(repo_root: Path, max_lines: int = 500) -> int:
count = 0
for path in _iter_source_files(repo_root):
lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
if lines > max_lines:
count += 1
lines = len(path.read_text(encoding="utf-8", errors="ignore").splitlines())
if lines > max_lines:
count += 1
return count
def _unit_tests_failed(output_path: Path, coverage_percent: float) -> bool:
if coverage_percent <= 0 or not output_path.exists():
return True
text = output_path.read_text(encoding="utf-8", errors="ignore")
start_marker = "[quality] unit tests + workspace coverage profile"
end_marker = "[quality] hygiene: doc contracts"
if start_marker in text:
text = text.split(start_marker, 1)[1]
if end_marker in text:
text = text.split(end_marker, 1)[0]
return bool(re.search(r"^(--- FAIL:|FAIL\b)", text, flags=re.M))
def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
if not output_path.exists():
return {"passed": 0, "failed": 0, "errors": 0, "skipped": 0}
@ -226,37 +166,14 @@ def _parse_go_test_counts(output_path: Path) -> dict[str, int]:
def _parse_go_test_cases(output_path: Path) -> list[tuple[str, str]]:
"""Parse per-test status records from go test output text."""
if not output_path.exists():
return []
text = output_path.read_text(encoding="utf-8", errors="ignore")
cases: list[tuple[str, str]] = []
patterns = {
"passed": re.compile(r"^--- PASS: ([^\s(]+)", flags=re.M),
"failed": re.compile(r"^--- FAIL: ([^\s(]+)", flags=re.M),
"skipped": re.compile(r"^--- SKIP: ([^\s(]+)", flags=re.M),
}
for status, pattern in patterns.items():
for test_name in pattern.findall(text):
cleaned = str(test_name).strip()
if cleaned:
cases.append((cleaned, status))
if cases:
return cases
# Fallback for non-verbose `go test` output where individual test names are absent.
package_cases: list[tuple[str, str]] = []
for package_name in re.findall(r"^ok\s+([^\s]+)", text, flags=re.M):
cleaned = str(package_name).strip()
if cleaned:
package_cases.append((f"package::{cleaned}", "passed"))
for package_name in re.findall(r"^FAIL\s+([^\s]+)", text, flags=re.M):
cleaned = str(package_name).strip()
if cleaned:
package_cases.append((f"package::{cleaned}", "failed"))
if package_cases:
deduped = list(dict.fromkeys(package_cases))
return deduped
for match in re.finditer(r"^---\s+(PASS|FAIL|SKIP):\s+(\S+)", text, flags=re.M):
raw_status, test_name = match.groups()
status = {"PASS": "passed", "FAIL": "failed", "SKIP": "skipped"}.get(raw_status, "error")
cases.append((test_name.strip(), status))
return cases
@ -307,23 +224,17 @@ def _sonarqube_check_status(build_dir: Path) -> str:
def _supply_chain_check_status(build_dir: Path) -> str:
required = os.getenv("QUALITY_GATE_IRONBANK_REQUIRED", "0").strip().lower() in {"1", "true", "yes", "on"}
report = _load_json(Path(os.getenv("QUALITY_GATE_IRONBANK_REPORT", str(build_dir / "ironbank-compliance.json"))))
if not report:
return "failed" if required else "not_applicable"
return "not_applicable"
compliant = report.get("compliant")
if isinstance(compliant, bool):
return "ok" if compliant else "failed"
status_candidates = [report.get("status"), report.get("result"), report.get("compliance")]
for value in status_candidates:
if isinstance(value, str):
normalized = value.strip().lower()
if normalized in QUALITY_SUCCESS_STATES:
return "ok"
if normalized in {"n/a", "na", "not_applicable", "not-applicable", "skipped", "skip"}:
return "failed" if required else "not_applicable"
return "failed" if required else "not_applicable"
return "failed" if required else "not_applicable"
return "ok" if value.strip().lower() in QUALITY_SUCCESS_STATES else "failed"
return "failed"
def parse_args(argv: list[str]) -> argparse.Namespace:
@ -367,19 +278,10 @@ def main(argv: list[str] | None = None) -> int:
args = parse_args(argv or sys.argv[1:])
repo_root = Path(__file__).resolve().parents[1]
build_dir = repo_root / "build"
gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
current_ok = 1 if gate_rc == 0 else 0
current_failed = 0 if gate_rc == 0 else 1
branch = os.getenv("BRANCH_NAME") or os.getenv("GIT_BRANCH") or "unknown"
if branch.startswith("origin/"):
branch = branch[len("origin/") :]
build_number = os.getenv("BUILD_NUMBER", "")
jenkins_job = os.getenv("JOB_NAME", "ananke")
remote_ok = 0
remote_failed = 0
remote_error = ""
already_recorded = False
try:
remote_ok = int(
_fetch_existing_counter(
@ -397,39 +299,21 @@ def main(argv: list[str] | None = None) -> int:
args.timeout_seconds,
)
)
already_recorded = bool(build_number) and _series_exists(
args.pushgateway_url,
"platform_quality_gate_build_info",
{
"job": args.job_name,
"suite": args.suite,
"branch": branch or "unknown",
"build_number": build_number or "unknown",
"jenkins_job": jenkins_job,
},
args.timeout_seconds,
)
except Exception as exc:
remote_error = str(exc)
resolved_ok = remote_ok
resolved_failed = remote_failed
if remote_error:
resolved_ok = args.local_ok
resolved_failed = args.local_failed
elif not already_recorded:
resolved_ok += current_ok
resolved_failed += current_failed
resolved_ok = max(args.local_ok, remote_ok)
resolved_failed = max(args.local_failed, remote_failed)
coverage_percent = _read_coverage_percent(args.coverage_percent_file)
source_files_total = _count_source_files(repo_root)
source_lines_over_500 = _count_source_files_over_limit(repo_root, max_lines=500)
quality_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
tests = _parse_go_test_counts(quality_output)
test_cases = _parse_go_test_cases(quality_output)
test_output = Path(os.getenv("ANANKE_QUALITY_OUTPUT_FILE", str(build_dir / "quality-gate.out")))
tests = _parse_go_test_counts(test_output)
test_cases = _parse_go_test_cases(test_output)
gate_rc = _read_exit_code(Path(os.getenv("ANANKE_QUALITY_EXIT_CODE_PATH", str(build_dir / "quality-gate.rc"))))
docs_status = _read_status(Path(os.getenv("ANANKE_QUALITY_DOCS_STATUS_PATH", str(build_dir / "docs-naming.status"))))
unit_tests_failed = _unit_tests_failed(quality_output, coverage_percent)
gate_failed = gate_rc != 0
checks = {
"tests": "failed" if unit_tests_failed or tests["failed"] > 0 or tests["errors"] > 0 else "ok",
"tests": "failed" if gate_failed or tests["failed"] > 0 else "ok",
"coverage": "ok" if coverage_percent >= 95.0 else "failed",
"loc": "ok" if source_lines_over_500 == 0 else "failed",
"docs_naming": docs_status,
@ -448,11 +332,7 @@ def main(argv: list[str] | None = None) -> int:
tests_skipped=tests["skipped"],
test_cases=test_cases,
coverage_percent=coverage_percent,
source_files_total=source_files_total,
source_lines_over_500=source_lines_over_500,
branch=branch,
build_number=build_number,
jenkins_job=jenkins_job,
checks=checks,
)
@ -465,8 +345,7 @@ def main(argv: list[str] | None = None) -> int:
summary = (
f"[quality] published Pushgateway metrics suite={args.suite} job={args.job_name} ok={resolved_ok} "
f"failed={resolved_failed} coverage={coverage_percent:.3f} source_files_total={source_files_total} "
f"source_lines_over_500={source_lines_over_500}"
f"failed={resolved_failed} coverage={coverage_percent:.3f} source_lines_over_500={source_lines_over_500}"
)
if remote_error:
summary += f" remote_read_error={remote_error}"

View File

@ -3,11 +3,8 @@
from __future__ import annotations
import http.server
from pathlib import Path
import socketserver
import tempfile
import threading
from unittest import mock
import unittest
import publish_quality_metrics as publisher
@ -61,19 +58,7 @@ class PublishQualityMetricsTest(unittest.TestCase):
self.server.server_close()
self.thread.join(timeout=5)
def _env_for_gate_status(self, status: int = 0) -> dict[str, str]:
tmp_dir = tempfile.TemporaryDirectory()
self.addCleanup(tmp_dir.cleanup)
rc_path = Path(tmp_dir.name) / "quality-gate.rc"
rc_path.write_text(f"{status}\n", encoding="utf-8")
return {
"ANANKE_QUALITY_EXIT_CODE_PATH": str(rc_path),
"ANANKE_QUALITY_COVERAGE_PERCENT_FILE": str(Path(tmp_dir.name) / "coverage.txt"),
"ANANKE_QUALITY_OUTPUT_FILE": str(Path(tmp_dir.name) / "quality-gate.out"),
"ANANKE_QUALITY_DOCS_STATUS_PATH": str(Path(tmp_dir.name) / "docs-naming.status"),
}
def test_publish_adds_current_run_to_remote_counters(self) -> None:
def test_publish_uses_remote_high_water_mark(self) -> None:
_GatewayHandler.metrics_text = "\n".join(
[
'# TYPE platform_quality_gate_runs_total counter',
@ -82,93 +67,51 @@ class PublishQualityMetricsTest(unittest.TestCase):
]
)
with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--trigger",
"host",
"--local-ok",
"5",
"--local-failed",
"2",
]
)
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--trigger",
"host",
"--local-ok",
"5",
"--local-failed",
"2",
]
)
self.assertEqual(exit_code, 0)
self.assertEqual(len(_GatewayHandler.posts), 1)
path, body = _GatewayHandler.posts[0]
self.assertEqual(path, "/metrics/job/platform-quality-ci/suite/ananke")
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 8', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 2', body)
self.assertIn('ananke_quality_gate_publish_info{suite="ananke",trigger="host"} 1', body)
self.assertIn('ananke_quality_gate_coverage_percent{suite="ananke"}', body)
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)
def test_publish_does_not_double_count_same_build(self) -> None:
_GatewayHandler.metrics_text = "\n".join(
[
'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="ok"} 7',
'platform_quality_gate_runs_total{job="platform-quality-ci",suite="ananke",status="failed"} 1',
'platform_quality_gate_build_info{job="platform-quality-ci",suite="ananke",branch="main",build_number="78",jenkins_job="ananke"} 1',
]
)
with mock.patch.dict(
"os.environ",
{
**self._env_for_gate_status(0),
"BRANCH_NAME": "main",
"BUILD_NUMBER": "78",
"JOB_NAME": "ananke",
},
):
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--trigger",
"host",
"--local-ok",
"1",
"--local-failed",
"0",
]
)
self.assertEqual(exit_code, 0)
_, body = _GatewayHandler.posts[0]
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 7', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 1', body)
def test_publish_falls_back_to_local_counters_when_metrics_read_fails(self) -> None:
_GatewayHandler.fail_metrics_read = True
with mock.patch.dict("os.environ", self._env_for_gate_status(0)):
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--local-ok",
"11",
"--local-failed",
"3",
]
)
exit_code = publisher.main(
[
"--pushgateway-url",
self.base_url,
"--job-name",
"platform-quality-ci",
"--suite",
"ananke",
"--local-ok",
"11",
"--local-failed",
"3",
]
)
self.assertEqual(exit_code, 0)
self.assertEqual(len(_GatewayHandler.posts), 1)
@ -176,7 +119,6 @@ class PublishQualityMetricsTest(unittest.TestCase):
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="ok"} 11', body)
self.assertIn('platform_quality_gate_runs_total{suite="ananke",status="failed"} 3', body)
self.assertIn('platform_quality_gate_workspace_line_coverage_percent{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_files_total{suite="ananke"}', body)
self.assertIn('platform_quality_gate_source_lines_over_500_total{suite="ananke"}', body)

View File

@ -158,9 +158,15 @@ mkdir -p "${BUILD_DIR}"
rm -f "${COVERAGE_PROFILE}" "${COVERAGE_PERCENT_FILE}"
printf 'failed\n' > "${BUILD_DIR}/docs-naming.status"
echo "[quality] dependency download"
echo "[quality] unit tests + workspace coverage profile"
export GOPROXY="${GOPROXY:-https://proxy.golang.org,direct}"
run_with_retry 4 go mod download
run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
if [[ -z "${coverage_percent}" ]]; then
coverage_percent="0"
fi
printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
echo "[quality] hygiene: doc contracts"
cd testing
@ -183,14 +189,6 @@ echo "[quality] lint"
echo "[quality] installer template contracts"
./scripts/verify_install_templates.sh
echo "[quality] unit tests + workspace coverage profile"
run_with_retry 3 go test -coverprofile="${COVERAGE_PROFILE}" ./...
coverage_percent="$(go tool cover -func="${COVERAGE_PROFILE}" | awk '/^total:/ {gsub("%","",$3); print $3}')"
if [[ -z "${coverage_percent}" ]]; then
coverage_percent="0"
fi
printf '%s\n' "${coverage_percent}" > "${COVERAGE_PERCENT_FILE}"
echo "[quality] per-file coverage gate (95%)"
cd testing
ANANKE_ENFORCE_COVERAGE=1 ANANKE_PER_FILE_COVERAGE_TARGET=95 go test ./coverage -run TestPerFileCoverageReport -count=1 -v

View File

@ -17,12 +17,6 @@ import (
const maxGoFileLOC = 500
var goFileNamePattern = regexp.MustCompile(`^[a-z0-9]+(_[a-z0-9]+)*(_test)?\.go$`)
var genericFileNameTokens = map[string]struct{}{
"chunk": {},
"part": {},
"piece": {},
"split": {},
}
func repoRoot(tb testing.TB) string {
tb.Helper()
@ -67,16 +61,13 @@ func collectGoFiles(tb testing.TB, roots ...string) []string {
func TestHygieneContracts(t *testing.T) {
root := repoRoot(t)
files := collectGoFiles(t, filepath.Join(root, "cmd"), filepath.Join(root, "internal"))
namingFiles := append([]string{}, files...)
namingFiles = append(namingFiles, collectGoFiles(t, filepath.Join(root, "testing"))...)
sort.Strings(files)
sort.Strings(namingFiles)
t.Run("doc_contract", func(t *testing.T) {
checkDocContracts(t, files)
})
t.Run("naming_contract", func(t *testing.T) {
checkNamingContracts(t, namingFiles)
checkNamingContracts(t, files)
})
t.Run("loc_limit", func(t *testing.T) {
checkFileLOCLimits(t, files)
@ -130,19 +121,9 @@ func checkNamingContracts(t *testing.T, files []string) {
if !goFileNamePattern.MatchString(base) {
t.Errorf("%s: filename %q violates naming contract %s", file, base, goFileNamePattern.String())
}
for _, token := range filenameTokens(base) {
if _, ok := genericFileNameTokens[token]; ok {
t.Errorf("%s: filename %q uses generic split-file token %q", file, base, token)
}
}
}
}
func filenameTokens(name string) []string {
trimmed := strings.TrimSuffix(strings.TrimSuffix(name, ".go"), "_test")
return strings.Split(trimmed, "_")
}
// checkFileLOCLimits runs one orchestration or CLI step.
// Signature: checkFileLOCLimits(t *testing.T, files []string).
// Why: A strict LOC cap forces focused files and keeps refactors manageable.

View File

@ -13,8 +13,6 @@ cmd/ananke/power_safety_test.go
cmd/ananke/test_helpers_test.go
internal/cluster/orchestrator_inventory_test.go
internal/cluster/orchestrator_report_test.go
internal/cluster/orchestrator_autorepair_test.go
internal/cluster/orchestrator_autorepair_cleanup_test.go
internal/cluster/orchestrator_test.go
internal/cluster/orchestrator_unit_additional_test.go
internal/cluster/orchestrator_vault_test.go
@ -23,7 +21,6 @@ internal/config/load_additional_test.go
internal/config/validate_matrix_test.go
internal/service/daemon_additional_test.go
internal/service/daemon_coverage_closeout_test.go
internal/service/daemon_poststart_autorepair_test.go
internal/service/daemon_quality_branches_test.go
internal/service/daemon_test.go
internal/sshutil/repair_test.go

View File

@ -363,3 +363,4 @@ func TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T) {
}
})
}

View File

@ -79,29 +79,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
}
})
t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
return `{"items":[
{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}},
{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ready, detail, err := orch.TestHookFluxHealthReady(context.Background())
if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") {
t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
}
})
t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
@ -168,42 +145,6 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
}
})
t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
cfg.Startup.StuckPodGraceSeconds = 1
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[
{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}},
{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background())
if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
}
failures, err := orch.TestHookStartupFailurePods(context.Background())
if err != nil {
t.Fatalf("startup failure pod query: %v", err)
}
if len(failures) != 0 {
t.Fatalf("expected optional namespace failures to be ignored, got %v", failures)
}
})
t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// newHookOrchestratorWithRunnerMode runs one orchestration or CLI step.
// Signature: newHookOrchestratorWithRunnerMode(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
// Why: these scenarios need dry-run and non-dry-run variants while keeping
// newHookOrchestratorAdvanced runs one orchestration or CLI step.
// Signature: newHookOrchestratorAdvanced(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
// Why: this part10 matrix needs dry-run and non-dry-run variants while keeping
// command dispatch deterministic from the top-level testing module.
func newHookOrchestratorWithRunnerMode(
func newHookOrchestratorAdvanced(
t *testing.T,
cfg config.Config,
dryRun bool,
@ -49,11 +49,11 @@ func newHookOrchestratorWithRunnerMode(
return orch, recorder
}
// TestHookVaultLifecycleBranchMatrix runs one orchestration or CLI step.
// Signature: TestHookVaultLifecycleBranchMatrix(t *testing.T).
// TestHookGapMatrixPart10LowFileClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart10LowFileClosure(t *testing.T).
// Why: closes remaining branch gaps on low-coverage orchestrator files using
// targeted hook-level scenarios instead of brittle full-drill reruns.
func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
func TestHookGapMatrixPart10LowFileClosure(t *testing.T) {
t.Run("critical-vault-low-branches", func(t *testing.T) {
t.Run("vault-sealed-parse-error", func(t *testing.T) {
cfg := lifecycleConfig(t)
@ -64,7 +64,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookVaultSealed(context.Background()); err == nil || !strings.Contains(err.Error(), "parse vault status") {
t.Fatalf("expected vault status parse error branch, got %v", err)
}
@ -81,7 +81,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookVaultUnsealKey(context.Background()); err == nil || !strings.Contains(err.Error(), "vault-init unseal key is empty") {
t.Fatalf("expected empty decoded unseal key branch, got %v", err)
}
@ -90,7 +90,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("write-unseal-key-file-write-error", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.VaultUnsealKeyFile = t.TempDir()
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
if err := orch.TestHookWriteVaultUnsealKeyFile("vault-key"); err == nil || !strings.Contains(err.Error(), "write vault unseal key file") {
t.Fatalf("expected write failure branch when key path is a directory, got %v", err)
}
@ -105,7 +105,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchNoValue, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runNoValue, runNoValue)
orchNoValue, _ := newHookOrchestratorAdvanced(t, cfg, false, runNoValue, runNoValue)
ready, err := orchNoValue.TestHookWorkloadReady(context.Background(), "vault", "statefulset", "vault")
if err != nil || ready {
t.Fatalf("expected no-value readiness branch, ready=%v err=%v", ready, err)
@ -124,7 +124,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchEnsureErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runEnsureErr, runEnsureErr)
orchEnsureErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runEnsureErr, runEnsureErr)
if err := orchEnsureErr.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "rollout failed") {
t.Fatalf("expected ensureCriticalStartupWorkloads wait error branch, got %v", err)
}
@ -139,7 +139,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchPhase, _ := newHookOrchestratorWithRunnerMode(t, cfgPhase, false, runPhase, runPhase)
orchPhase, _ := newHookOrchestratorAdvanced(t, cfgPhase, false, runPhase, runPhase)
if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
t.Fatalf("expected pod phase guard branch, got %v", err)
}
@ -170,7 +170,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return runFollowup(ctx, timeout, name, args...)
}
orchFollowup, _ := newHookOrchestratorWithRunnerMode(t, cfgFollowup, false, runFollowup, runSensitive)
orchFollowup, _ := newHookOrchestratorAdvanced(t, cfgFollowup, false, runFollowup, runSensitive)
if err := orchFollowup.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") {
t.Fatalf("expected follow-up sealed status error branch, got %v", err)
}
@ -204,7 +204,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
err := orch.TestHookDrainWorkers(context.Background(), workers)
if err == nil || !strings.Contains(err.Error(), "drain workers had 5 errors") {
t.Fatalf("expected drain aggregation branch, got %v", err)
@ -217,7 +217,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
cfg.SSHManagedNodes = []string{"titan-db"}
rec := &commandRecorder{}
base := lifecycleDispatcher(rec)
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
orch.TestHookRunSSHAcrossNodes(context.Background(), []string{"titan-db", "not-managed"}, "noop", "echo ok")
if !rec.contains("atlas@titan-db echo ok") {
t.Fatalf("expected managed ssh execution branch")
@ -233,7 +233,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if _, err := orch.TestHookLatestEtcdSnapshotPath(context.Background(), "titan-db"); err == nil || !strings.Contains(err.Error(), "no etcd snapshots found") {
t.Fatalf("expected empty snapshot-list branch, got %v", err)
}
@ -250,7 +250,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchWorkers, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runWorkers, runWorkers)
orchWorkers, _ := newHookOrchestratorAdvanced(t, cfg, false, runWorkers, runWorkers)
workers, err := orchWorkers.TestHookEffectiveWorkers(context.Background())
if err != nil || len(workers) == 0 {
t.Fatalf("expected inventory worker fallback branch, workers=%v err=%v", workers, err)
@ -273,7 +273,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchWrite, _ := newHookOrchestratorWithRunnerMode(t, cfgWrite, false, runWrite, runWrite)
orchWrite, _ := newHookOrchestratorAdvanced(t, cfgWrite, false, runWrite, runWrite)
if err := orchWrite.TestHookScaleDownApps(context.Background()); err == nil || !strings.Contains(err.Error(), "write scaled workload snapshot") {
t.Fatalf("expected scaled snapshot write-failure branch, got %v", err)
}
@ -294,7 +294,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchReady, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runReady, runReady)
orchReady, _ := newHookOrchestratorAdvanced(t, cfg, false, runReady, runReady)
ready, detail, err := orchReady.TestHookFluxHealthReady(context.Background())
if err != nil || ready || !strings.Contains(detail, "ready=false") {
t.Fatalf("expected flux ready-reason fallback branch, ready=%v detail=%q err=%v", ready, detail, err)
@ -319,7 +319,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
if err := orch.TestHookWaitForFluxHealth(ctx); !errors.Is(err, context.Canceled) {
@ -336,7 +336,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
rec := &commandRecorder{}
base := lifecycleDispatcher(rec)
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, base, base)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, base, base)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected ensureRequiredNodeLabels skip/apply branches, got %v", err)
}
@ -347,7 +347,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("wait-for-startup-convergence-dryrun-and-critical-endpoint-fail", func(t *testing.T) {
cfgDry := lifecycleConfig(t)
orchDry, _ := newHookOrchestratorWithRunnerMode(t, cfgDry, true, nil, nil)
orchDry, _ := newHookOrchestratorAdvanced(t, cfgDry, true, nil, nil)
if err := orchDry.TestHookWaitForStartupConvergence(context.Background()); err != nil {
t.Fatalf("expected startup convergence dry-run fast path, got %v", err)
}
@ -365,7 +365,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchFail, _ := newHookOrchestratorWithRunnerMode(t, cfgFail, false, run, run)
orchFail, _ := newHookOrchestratorAdvanced(t, cfgFail, false, run, run)
if err := orchFail.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query endpoints") {
t.Fatalf("expected critical-endpoint convergence failure branch, got %v", err)
}
@ -373,7 +373,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
t.Run("ingress-namespace-discovery-empty-and-query-error", func(t *testing.T) {
cfg := lifecycleConfig(t)
orchEmpty, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
orchEmpty, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
namespaces, err := orchEmpty.TestHookDiscoverIngressNamespacesForHost(context.Background(), " ")
if err != nil || len(namespaces) != 0 {
t.Fatalf("expected empty-host fast path, namespaces=%v err=%v", namespaces, err)
@ -386,7 +386,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchErr, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, runErr, runErr)
orchErr, _ := newHookOrchestratorAdvanced(t, cfg, false, runErr, runErr)
if _, err := orchErr.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query ingresses") {
t.Fatalf("expected ingress query error branch, got %v", err)
}
@ -412,7 +412,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
URL: "http://" + listener.Addr().String() + "/health",
AcceptedStatuses: []int{200},
}}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, nil, nil)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, nil, nil)
ready, detail := orch.TestHookServiceChecklistReady(context.Background())
if ready || !strings.Contains(detail, "http://") {
t.Fatalf("expected service checklist URL-name fallback failure, ready=%v detail=%q", ready, detail)
@ -435,7 +435,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
if err := orch.TestHookWaitForNodeInventoryReachability(ctx); !errors.Is(err, context.Canceled) {
@ -456,7 +456,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
if err := orch.TestHookWaitForPostStartProbes(ctx); !errors.Is(err, context.Canceled) {
@ -478,7 +478,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if err := orch.TestHookResumeFluxAndReconcile(context.Background()); err != nil {
t.Fatalf("expected resume flux warning-only branch, got %v", err)
}
@ -505,7 +505,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
if err := orch.TestHookWaitForTimeSync(ctx, []string{"", "titan-db"}); !errors.Is(err, context.Canceled) {
@ -532,14 +532,14 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestratorWithRunnerMode(t, cfg, false, run, run)
orch, _ := newHookOrchestratorAdvanced(t, cfg, false, run, run)
if err := orch.TestHookWaitForWorkloadConvergence(context.Background()); err != nil {
t.Fatalf("expected workload convergence default-branch success, got %v", err)
}
cfgIgnore := lifecycleConfig(t)
cfgIgnore.Startup.AutoRecycleStuckPods = false
orchIgnoreDry, _ := newHookOrchestratorWithRunnerMode(t, cfgIgnore, true, run, run)
orchIgnoreDry, _ := newHookOrchestratorAdvanced(t, cfgIgnore, true, run, run)
now := time.Now().UTC().Add(-time.Hour)
orchIgnoreDry.TestHookMaybeAutoRecycleStuckPods(context.Background(), &now)
orchIgnoreDry.TestHookMaybeAutoHealCriticalWorkloadReplicas(context.Background(), &now)
@ -551,7 +551,7 @@ func TestHookVaultLifecycleBranchMatrix(t *testing.T) {
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchHealErr, _ := newHookOrchestratorWithRunnerMode(t, lifecycleConfig(t), false, runHealErr, runHealErr)
orchHealErr, _ := newHookOrchestratorAdvanced(t, lifecycleConfig(t), false, runHealErr, runHealErr)
if _, err := orchHealErr.TestHookHealCriticalWorkloadReplicas(context.Background()); err == nil || !strings.Contains(err.Error(), "query workloads") {
t.Fatalf("expected critical workload heal query-error branch, got %v", err)
}

View File

@ -20,7 +20,7 @@ import (
// newLifecycleMatrixOrchestrator runs one orchestration or CLI step.
// Signature: newLifecycleMatrixOrchestrator(t *testing.T, cfg config.Config, dryRun bool, run commandOverride, runSensitive commandOverride, kubeconfig string) *cluster.Orchestrator.
// Why: lifecycle cleanup scenarios need direct control over runner dry-run and kubeconfig branches.
// Why: part11 needs direct control over runner dry-run and kubeconfig branches.
func newLifecycleMatrixOrchestrator(
t *testing.T,
cfg config.Config,
@ -49,11 +49,11 @@ func newLifecycleMatrixOrchestrator(
return orch
}
// TestHookLifecycleCleanupRemainingClosure runs one orchestration or CLI step.
// Signature: TestHookLifecycleCleanupRemainingClosure(t *testing.T).
// TestHookGapMatrixPart11RemainingClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart11RemainingClosure(t *testing.T).
// Why: closes final branch gaps for lifecycle + remaining near-threshold
// orchestrator files so per-file coverage reaches the enforced 95% target.
func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
func TestHookGapMatrixPart11RemainingClosure(t *testing.T) {
t.Run("critical-vault-final-closures", func(t *testing.T) {
t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
apiVersionCalls++
if apiVersionCalls <= 2 {
if apiVersionCalls == 1 {
return "", errors.New("api down")
}
return "v1.31.0", nil

View File

@ -17,11 +17,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookTimesyncAndStabilityMatrix runs one orchestration or CLI step.
// Signature: TestHookTimesyncAndStabilityMatrix(t *testing.T).
// TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
// Why: drives low-coverage time-sync, datastore parsing, and startup stability
// branches from the top-level testing module.
func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
cases := []struct {
line string
@ -162,11 +162,11 @@ func TestHookTimesyncAndStabilityMatrix(t *testing.T) {
})
}
// TestHookFluxScalingReportMatrix runs one orchestration or CLI step.
// Signature: TestHookFluxScalingReportMatrix(t *testing.T).
// TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
// Why: targets low branch density in flux-health, scaling snapshot handling,
// and report sanitization helpers.
func TestHookFluxScalingReportMatrix(t *testing.T) {
func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
t.Run("flux-helper-matrix", func(t *testing.T) {
if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
t.Fatalf("expected immutable matcher true for uppercase+job variant")
@ -241,11 +241,11 @@ func TestHookFluxScalingReportMatrix(t *testing.T) {
})
}
// TestHookVaultAndCoordinationMatrix runs one orchestration or CLI step.
// Signature: TestHookVaultAndCoordinationMatrix(t *testing.T).
// TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
// Why: raises branch coverage on vault/key and coordination helpers without
// requiring package-local tests.
func TestHookVaultAndCoordinationMatrix(t *testing.T) {
func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.VaultUnsealKeyFile = ""
@ -296,11 +296,11 @@ func TestHookVaultAndCoordinationMatrix(t *testing.T) {
})
}
// TestHookWorkloadIgnoreMatrix runs one orchestration or CLI step.
// Signature: TestHookWorkloadIgnoreMatrix(t *testing.T).
// TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
// Why: expands low branch coverage in workload ignore helpers and startup-failure
// pod classification.
func TestHookWorkloadIgnoreMatrix(t *testing.T) {
func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) {
t.Run("ignored-node-helper-matrix", func(t *testing.T) {
if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
t.Fatalf("expected selector-host ignored match")

View File

@ -11,11 +11,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/config"
)
// TestHookConvergenceAndStabilityMatrix runs one orchestration or CLI step.
// Signature: TestHookConvergenceAndStabilityMatrix(t *testing.T).
// TestHookGapMatrixPart3ConvergenceAndStability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T).
// Why: raises coverage for startup convergence orchestration and stability gates
// that determine whether startup is considered truly complete.
func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
func TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T) {
t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) {
cfgIngress := lifecycleConfig(t)
cfgIngress.Startup.RequireIngressChecklist = true
@ -108,11 +108,11 @@ func TestHookConvergenceAndStabilityMatrix(t *testing.T) {
})
}
// TestHookLifecycleRestoreShutdownMatrix runs one orchestration or CLI step.
// Signature: TestHookLifecycleRestoreShutdownMatrix(t *testing.T).
// TestHookGapMatrixPart3LifecycleRestoreShutdown runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T).
// Why: fills lifecycle restore/shutdown success paths that are easy to miss in
// failure-focused drill tests.
func TestHookLifecycleRestoreShutdownMatrix(t *testing.T) {
func TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T) {
t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) {
cfgDry := lifecycleConfig(t)
dry := newDryRunHookOrchestrator(t, cfgDry, nil)

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookCoordinationAndReachabilityMatrix runs one orchestration or CLI step.
// Signature: TestHookCoordinationAndReachabilityMatrix(t *testing.T).
// TestHookGapMatrixPart4CoordinationAndReachability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T).
// Why: closes remaining coordination/reachability low branches with deterministic
// command responses and short timeouts.
func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
func TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T) {
t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Coordination.PeerHosts = []string{"titan-24"}
@ -136,11 +136,11 @@ func TestHookCoordinationAndReachabilityMatrix(t *testing.T) {
})
}
// TestHookIngressServiceAndPostStartMatrix runs one orchestration or CLI step.
// Signature: TestHookIngressServiceAndPostStartMatrix(t *testing.T).
// TestHookGapMatrixPart4IngressServiceAndPostStart runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T).
// Why: drives ingress/service checklist and post-start branches that were still
// under-covered after drill-focused matrix tests.
func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
func TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T) {
t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
@ -194,11 +194,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{
Name: "forbidden-marker",
URL: srv.URL,
Name: "forbidden-marker",
URL: srv.URL,
AcceptedStatuses: []int{200},
BodyNotContains: "marker",
TimeoutSeconds: 2,
BodyNotContains: "marker",
TimeoutSeconds: 2,
})
if ok || !strings.Contains(detail, "forbidden marker") {
t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail)
@ -233,11 +233,11 @@ func TestHookIngressServiceAndPostStartMatrix(t *testing.T) {
})
}
// TestHookReportScalingStorageDrainMatrix runs one orchestration or CLI step.
// Signature: TestHookReportScalingStorageDrainMatrix(t *testing.T).
// TestHookGapMatrixPart4ReportScalingStorageDrain runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T).
// Why: covers artifact, scaling snapshot, storage, and drain error branches that
// are difficult to hit from happy-path lifecycle drills.
func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
func TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T) {
t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
cfg := lifecycleConfig(t)
reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
@ -339,11 +339,11 @@ func TestHookReportScalingStorageDrainMatrix(t *testing.T) {
})
}
// TestHookTimesyncLifecycleAndAccessMatrix runs one orchestration or CLI step.
// Signature: TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T).
// TestHookGapMatrixPart4TimesyncLifecycleAndAccess runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T).
// Why: closes remaining timing/access/lifecycle branches that still sat below
// target after the earlier matrices.
func TestHookTimesyncLifecycleAndAccessMatrix(t *testing.T) {
func TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T) {
t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.TimeSyncMode = "quorum"

View File

@ -20,11 +20,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookEndpointHealingCoverageClosure runs one orchestration or CLI step.
// Signature: TestHookEndpointHealingCoverageClosure(t *testing.T).
// TestHookGapMatrixPart5CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart5CoverageClosure(t *testing.T).
// Why: closes branch gaps that still remained after drill-style tests by driving
// low-coverage orchestrator internals through the exported top-level hook surface.
func TestHookEndpointHealingCoverageClosure(t *testing.T) {
func TestHookGapMatrixPart5CoverageClosure(t *testing.T) {
t.Run("critical-endpoint-backend-heal-matrix", func(t *testing.T) {
t.Run("empty-namespace-service-noop", func(t *testing.T) {
orch, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
@ -491,10 +491,10 @@ func httpStatusHandler(code int, body string) func(http.ResponseWriter, *http.Re
}
}
// TestHookIngressHostMappingRegression runs one orchestration or CLI step.
// Signature: TestHookIngressHostMappingRegression(t *testing.T).
// TestHookGapMatrixPart5IngressHostMappingRegression runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T).
// Why: ensures host parsing fallback paths stay stable for ingress/service checklist failures.
func TestHookIngressHostMappingRegression(t *testing.T) {
func TestHookGapMatrixPart5IngressHostMappingRegression(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},

View File

@ -16,11 +16,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookVaultPostStartBranchMatrix runs one orchestration or CLI step.
// Signature: TestHookVaultPostStartBranchMatrix(t *testing.T).
// Why: targets the remaining low branch paths after endpoint-healing coverage so per-file coverage
// TestHookGapMatrixPart6CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart6CoverageClosure(t *testing.T).
// Why: targets the remaining low branch paths after part5 so per-file coverage
// can move toward the strict 95% quality gate.
func TestHookVaultPostStartBranchMatrix(t *testing.T) {
func TestHookGapMatrixPart6CoverageClosure(t *testing.T) {
t.Run("critical-vault-and-poststart-branches", func(t *testing.T) {
t.Run("wait-vault-ready-dryrun-and-cancel", func(t *testing.T) {
cfg := lifecycleConfig(t)

View File

@ -14,11 +14,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookWorkloadStorageAccessMatrix runs one orchestration or CLI step.
// Signature: TestHookWorkloadStorageAccessMatrix(t *testing.T).
// TestHookGapMatrixPart7CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart7CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in convergence, storage, access,
// flux, lifecycle, and sensitive command wrappers.
func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
func TestHookGapMatrixPart7CoverageClosure(t *testing.T) {
t.Run("workload-convergence-branch-matrix", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
@ -165,32 +165,6 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
}
})
t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequireNodeSSHAuth = true
cfg.Startup.NodeSSHAuthWaitSeconds = 1
cfg.Startup.NodeSSHAuthPollSeconds = 1
cfg.Startup.NodeInventoryReachWaitSeconds = 1
cfg.Startup.NodeInventoryReachPollSeconds = 1
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") {
return "", errors.New("no route to host")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err)
}
if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil {
t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err)
}
})
})
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {

View File

@ -19,11 +19,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookAccessVaultLifecycleMatrix runs one orchestration or CLI step.
// Signature: TestHookAccessVaultLifecycleMatrix(t *testing.T).
// TestHookGapMatrixPart8CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart8CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in access, vault, lifecycle,
// ingress/service stability, and timesync/inventory orchestration paths.
func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
func TestHookGapMatrixPart8CoverageClosure(t *testing.T) {
t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequireNodeSSHAuth = true
@ -331,11 +331,11 @@ func TestHookAccessVaultLifecycleMatrix(t *testing.T) {
})
}
// TestHookLifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
// Signature: TestHookLifecycleStartupAutoRestoreBranch(t *testing.T).
// TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T).
// Why: covers Startup's API-failure->auto-restore retry path that is otherwise
// hard to exercise in deterministic top-level tests.
func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
func TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
@ -384,7 +384,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lifecycle-auto-restore"})
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "part8-auto-restore"})
if err != nil {
t.Fatalf("expected startup auto-restore path success, got %v", err)
}
@ -394,7 +394,7 @@ func TestHookLifecycleStartupAutoRestoreBranch(t *testing.T) {
cfgBadMode := lifecycleConfig(t)
orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil)
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "lifecycle", Mode: "unknown-mode"})
err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "part8", Mode: "unknown-mode"})
if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") {
t.Fatalf("expected shutdown unsupported-mode branch, got %v", err)
}

View File

@ -16,11 +16,11 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookAccessCoordinationEndpointsMatrix runs one orchestration or CLI step.
// Signature: TestHookAccessCoordinationEndpointsMatrix(t *testing.T).
// TestHookGapMatrixPart9AccessCoordinationEndpoints runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T).
// Why: closes uncovered statement ranges in access/fluxsource, coordination,
// and critical-endpoint orchestration helpers.
func TestHookAccessCoordinationEndpointsMatrix(t *testing.T) {
func TestHookGapMatrixPart9AccessCoordinationEndpoints(t *testing.T) {
t.Run("access-fluxsource-uncovered-ranges", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Shutdown.SSHParallelism = 0

View File

@ -53,48 +53,6 @@ func TestHookIngressServiceMatrix(t *testing.T) {
}
})
t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command)
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err)
}
})
t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
}
})
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)

View File

@ -4,6 +4,7 @@ import (
"context"
"errors"
"net"
"os"
"strings"
"testing"
"time"
@ -124,25 +125,20 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false
cfg.Startup.ShutdownCooldownSeconds = 5
reads := 0
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
if path != cfg.State.IntentPath {
return state.TestHookReadIntentDefault(path)
}
reads++
if reads == 1 {
return state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{}, errors.New("forced reread failure")
})
t.Cleanup(restoreRead)
cfg.Startup.ShutdownCooldownSeconds = 1
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC(),
}); err != nil {
t.Fatalf("seed cooldown intent: %v", err)
}
go func(intentPath string) {
time.Sleep(150 * time.Millisecond)
_ = os.Remove(intentPath)
_ = os.Mkdir(intentPath, 0o755)
}(cfg.State.IntentPath)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
@ -152,30 +148,24 @@ func TestLifecycleDeepFailureMatrix(t *testing.T) {
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false
cfg.Startup.ShutdownCooldownSeconds = 5
reads := 0
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
if path != cfg.State.IntentPath {
return state.TestHookReadIntentDefault(path)
}
reads++
if reads == 1 {
return state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{
cfg.Startup.ShutdownCooldownSeconds = 1
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC(),
}); err != nil {
t.Fatalf("seed cooldown intent: %v", err)
}
go func(intentPath string) {
time.Sleep(150 * time.Millisecond)
_ = state.WriteIntent(intentPath, state.Intent{
State: state.IntentShuttingDown,
Reason: "peer-shutdown",
Source: "test",
UpdatedAt: time.Now().UTC(),
}, nil
})
t.Cleanup(restoreRead)
})
}(cfg.State.IntentPath)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {

View File

@ -1,432 +0,0 @@
package orchestrator
import (
"context"
"errors"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
)
// TestHookSchedulingStormHelpers runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormHelpers(t *testing.T).
// Why: keeps scheduling-storm helper coverage in the split top-level testing module
// required by the repo hygiene contract.
func TestHookSchedulingStormHelpers(t *testing.T) {
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "ollama-rs", "Deployment", "ollama"); !ok || got != "ai/deployment/ollama" {
t.Fatalf("unexpected deployment owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("storage", "StatefulSet", "nextcloud", "", ""); !ok || got != "storage/statefulset/nextcloud" {
t.Fatalf("unexpected statefulset owner resolution: got=%q ok=%v", got, ok)
}
if got, ok := cluster.TestHookSchedulingStormOwnerWorkload("ai", "ReplicaSet", "missing", "", ""); ok || got != "" {
t.Fatalf("expected missing replicaset owner lookup to fail, got=%q ok=%v", got, ok)
}
if got := cluster.TestHookEventObservationCount(3, 9); got != 9 {
t.Fatalf("expected series count to win, got %d", got)
}
if got := cluster.TestHookEventObservationCount(0, 0); got != 1 {
t.Fatalf("expected zero-count normalization to 1, got %d", got)
}
now := time.Now().UTC().Round(time.Second)
if got := cluster.TestHookEventLastObservedAt(now, now.Add(-time.Minute), now.Add(-2*time.Minute), now.Add(-3*time.Minute)); !got.Equal(now) {
t.Fatalf("expected series timestamp priority, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, now, now.Add(-time.Minute), now.Add(-2*time.Minute)); !got.Equal(now) {
t.Fatalf("expected lastTimestamp fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, now, now.Add(-time.Minute)); !got.Equal(now) {
t.Fatalf("expected eventTime fallback, got %s", got)
}
if got := cluster.TestHookEventLastObservedAt(time.Time{}, time.Time{}, time.Time{}, now); !got.Equal(now) {
t.Fatalf("expected creationTimestamp fallback, got %s", got)
}
}
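
The assertions above pin down the fallback behaviour of the event helpers without showing them: the count prefers the series count and never drops below 1, and the observation time falls back in the order series, lastTimestamp, eventTime, creationTimestamp. One implementation consistent with those assertions is sketched below; the function bodies are inferred from the test expectations, not copied from the cluster package.

package sketch // inferred from the TestHookEventObservationCount / TestHookEventLastObservedAt assertions

import "time"

// eventObservationCount prefers the event-series count, then the legacy count,
// and normalizes zero to 1 so a single observed event still registers.
func eventObservationCount(count, seriesCount int) int {
	if seriesCount > 0 {
		return seriesCount
	}
	if count > 0 {
		return count
	}
	return 1
}

// eventLastObservedAt picks the first usable timestamp in priority order:
// series time, then lastTimestamp, then eventTime, then creationTimestamp.
func eventLastObservedAt(series, last, eventTime, created time.Time) time.Time {
	for _, ts := range []time.Time{series, last, eventTime, created} {
		if !ts.IsZero() {
			return ts
		}
	}
	return time.Time{}
}
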
// TestHookSchedulingStormQuarantine runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormQuarantine(t *testing.T).
// Why: verifies that only non-core workloads generating real scheduling storms
// are auto-quarantined, which prevents event/Kine churn from spiking control-plane CPU.
func TestHookSchedulingStormQuarantine(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 30
cfg.Startup.SchedulingStormWindowSeconds = 180
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault"}
cfg.Startup.IgnoreWorkloadNamespaces = []string{"ignored-ns"}
cfg.Startup.IgnoreWorkloads = []string{"monitoring/deployment/ignore-me"}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-22"}
scaledOllama := false
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"vault","name":"vault-0","ownerReferences":[{"kind":"StatefulSet","name":"vault"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"ignored-ns","name":"skip-pod","ownerReferences":[{"kind":"ReplicaSet","name":"skip-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignore-me-rs"}]},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ignored-node-rs"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"running-pod","ownerReferences":[{"kind":"ReplicaSet","name":"running-rs"}]},"spec":{},"status":{"phase":"Running"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}},
{"metadata":{"namespace":"ignored-ns","name":"skip-rs","ownerReferences":[{"kind":"Deployment","name":"skip"}]}},
{"metadata":{"namespace":"monitoring","name":"ignore-me-rs","ownerReferences":[{"kind":"Deployment","name":"ignore-me"}]}},
{"metadata":{"namespace":"monitoring","name":"ignored-node-rs","ownerReferences":[{"kind":"Deployment","name":"ignored-node"}]}},
{"metadata":{"namespace":"monitoring","name":"running-rs","ownerReferences":[{"kind":"Deployment","name":"running"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"vault","name":"vault-0"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ignored-ns","name":"skip-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignore-me-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"ignored-node-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"running-pod"},"type":"Warning","reason":"FailedScheduling","count":45},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"stale-pod"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}},
{"kind":"StatefulSet","metadata":{"namespace":"vault","name":"vault"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"ignored-ns","name":"skip"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignore-me"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignored-node"},"spec":{"replicas":1}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"running"},"spec":{"replicas":1}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
scaledOllama = true
return "", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm")
defer orch.TestHookFinalizeStartupReport(nil)
if err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background()); err != nil {
t.Fatalf("quarantine scheduling storm workloads: %v", err)
}
if !scaledOllama {
t.Fatalf("expected ollama deployment to be scaled to zero")
}
progress := readStartupProgress(t, orch)
if !strings.Contains(progress, "ollama") {
t.Fatalf("expected startup progress to mention ollama quarantine, payload=%s", progress)
}
if strings.Contains(progress, "vault") || strings.Contains(progress, "ignore-me") || strings.Contains(progress, "ignored-node") {
t.Fatalf("expected only the non-core eligible workload to be quarantined, payload=%s", progress)
}
}
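
The fixtures above imply the shape of the storm scan: recent Warning/FailedScheduling events are tallied per Pending pod, each pod is mapped to its owning Deployment or StatefulSet, core and explicitly ignored targets are filtered out, and whatever survives is scaled to zero. A condensed sketch of that selection step follows; the stormEvent/stormTarget types, the resolveOwner callback, and the single-pass filtering are invented here purely for illustration.

package sketch // condensed, assumed shape of the scheduling-storm quarantine selection

import "time"

type stormEvent struct {
	Namespace, Pod string
	Type, Reason   string
	Count          int
	ObservedAt     time.Time
}

type stormTarget struct {
	Namespace, Kind, Name string // e.g. "ai", "deployment", "ollama"
}

// selectQuarantineTargets keeps only workloads whose pods accumulated enough
// FailedScheduling warnings inside the window and which are neither core (required
// namespaces) nor explicitly ignored. resolveOwner is expected to succeed only for
// Pending pods that map to a scalable Deployment or StatefulSet.
func selectQuarantineTargets(
	events []stormEvent,
	threshold int,
	window time.Duration,
	requiredNamespaces, ignoredNamespaces, ignoredWorkloads map[string]bool,
	resolveOwner func(namespace, pod string) (stormTarget, bool),
) []stormTarget {
	now := time.Now().UTC()
	var targets []stormTarget
	seen := map[string]bool{}
	for _, ev := range events {
		if ev.Type != "Warning" || ev.Reason != "FailedScheduling" {
			continue
		}
		if ev.Count < threshold || now.Sub(ev.ObservedAt) > window {
			continue
		}
		if ignoredNamespaces[ev.Namespace] || requiredNamespaces[ev.Namespace] {
			continue
		}
		owner, ok := resolveOwner(ev.Namespace, ev.Pod)
		if !ok {
			continue
		}
		key := owner.Namespace + "/" + owner.Kind + "/" + owner.Name
		if ignoredWorkloads[key] || seen[key] {
			continue
		}
		seen[key] = true
		targets = append(targets, owner)
	}
	return targets
}
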
// TestHookSchedulingStormTriggerGuards runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerGuards(t *testing.T).
// Why: covers dry-run/disabled/rate-limit guards so the scheduler-storm auto-heal
// only activates when the cluster is actually suffering this exact failure mode.
func TestHookSchedulingStormTriggerGuards(t *testing.T) {
cfgDisabled := lifecycleConfig(t)
orchDisabled, _ := newHookOrchestrator(t, cfgDisabled, nil, nil)
lastAttempt := time.Time{}
orchDisabled.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected disabled scheduling-storm trigger to be skipped")
}
cfgDry := lifecycleConfig(t)
cfgDry.Startup.AutoQuarantineSchedulingStorms = true
orchDry := newDryRunHookOrchestrator(t, cfgDry, nil)
orchDry.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if !lastAttempt.IsZero() {
t.Fatalf("expected dry-run scheduling-storm trigger to be skipped")
}
cfgRate := lifecycleConfig(t)
cfgRate.Startup.AutoQuarantineSchedulingStorms = true
cfgRate.Startup.SchedulingStormEventThreshold = 5
cfgRate.Startup.SchedulingStormWindowSeconds = 60
recorder := &commandRecorder{}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
recorder.record(name, args)
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[]}`, nil
default:
return lifecycleDispatcher(recorder)(ctx, timeout, name, args...)
}
}
orchRate, _ := newHookOrchestrator(t, cfgRate, run, run)
lastAttempt = time.Now()
orchRate.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if recorder.contains("get pods -A -o json") {
t.Fatalf("expected rate-limited scheduling-storm trigger to skip kubectl scans")
}
}
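
The guard test above checks three skip conditions without showing the trigger itself. One consistent reading: the auto-heal does nothing when the feature is disabled, when the orchestrator is in dry-run, or when a previous attempt ran too recently, and only a real run records a new attempt time. The stormTrigger type, the method shape, and the retry interval constant below are assumptions; the real rate-limit value is not visible in this diff.

package sketch // assumed shape of the scheduling-storm trigger guards

import (
	"context"
	"time"
)

const stormRetryInterval = 5 * time.Minute // illustrative rate limit only

type stormTrigger struct {
	enabled bool
	dryRun  bool
	scan    func(ctx context.Context) error
}

// maybeQuarantine mirrors the guard order exercised by the test: disabled and dry-run
// skip silently, a recent attempt is rate-limited, and only a real run updates lastAttempt.
func (s stormTrigger) maybeQuarantine(ctx context.Context, lastAttempt *time.Time) {
	if !s.enabled || s.dryRun {
		return
	}
	if !lastAttempt.IsZero() && time.Since(*lastAttempt) < stormRetryInterval {
		return
	}
	*lastAttempt = time.Now()
	_ = s.scan(ctx) // scan errors surface through startup progress, not as a return value
}
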
// TestHookSchedulingStormTriggerAndNoOpBranches runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T).
// Why: raises scheduling-storm branch coverage on the success/no-op paths so the
// auto-heal only acts on genuine event storms and stays quiet otherwise.
func TestHookSchedulingStormTriggerAndNoOpBranches(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 0
cfg.Startup.SchedulingStormWindowSeconds = 0
scanRan := false
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
scanRan = true
return `{"items":[
{"metadata":{"namespace":"","name":"missing"}},
{"metadata":{"namespace":"monitoring","name":"no-owner"},"spec":{},"status":{"phase":"Pending"}},
{"metadata":{"namespace":"monitoring","name":"done","ownerReferences":[{"kind":"ReplicaSet","name":"done-rs"}]},"spec":{},"status":{"phase":"Running"}},
{"metadata":{"namespace":"monitoring","name":"zero-replicas","ownerReferences":[{"kind":"ReplicaSet","name":"zero-rs"}]},"spec":{},"status":{"phase":"Pending"}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[
{"metadata":{"namespace":"","name":"bad-rs"}},
{"metadata":{"namespace":"monitoring","name":"done-rs","ownerReferences":[{"kind":"","name":"ignored"}]}},
{"metadata":{"namespace":"monitoring","name":"zero-rs","ownerReferences":[{"kind":"Deployment","name":"zero"}]}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"normal"},"type":"Normal","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"wrong-reason"},"type":"Warning","reason":"SomeOtherReason","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Service","namespace":"monitoring","name":"wrong-kind"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"2000-01-01T00:00:00Z"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"old"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"low-count"},"type":"Warning","reason":"FailedScheduling","count":1},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"missing-pod"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"done"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"no-owner"},"type":"Warning","reason":"FailedScheduling","count":99},
{"metadata":{"creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"zero-replicas"},"type":"Warning","reason":"FailedScheduling","count":99}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[
{"kind":"","metadata":{"namespace":"monitoring","name":"blank-kind"}},
{"kind":"Job","metadata":{"namespace":"monitoring","name":"unsupported"}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"zero"},"spec":{"replicas":0}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
orch.TestHookBeginStartupReport("scheduling-storm-noop")
defer orch.TestHookFinalizeStartupReport(nil)
lastAttempt := time.Time{}
orch.TestHookMaybeAutoQuarantineSchedulingStorms(context.Background(), &lastAttempt)
if lastAttempt.IsZero() {
t.Fatalf("expected successful scheduling-storm trigger to update lastAttempt")
}
if !scanRan {
t.Fatalf("expected scheduling-storm scan to execute")
}
progress := readStartupProgress(t, orch)
if strings.Contains(progress, "quarantined scheduling storm workload") {
t.Fatalf("expected no-op scheduling-storm scan to avoid auto-heal output, payload=%s", progress)
}
}
// TestHookSchedulingStormErrorMatrix runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormErrorMatrix(t *testing.T).
// Why: covers malformed/error response branches in the scheduling-storm scan so
// Ananke can surface precise diagnostics when the API itself is part of the problem.
func TestHookSchedulingStormErrorMatrix(t *testing.T) {
cases := []struct {
name string
run func(context.Context, time.Duration, string, ...string) (string, error)
wantErr string
}{
{
name: "pods-query-error",
run: func(_ context.Context, _ time.Duration, name string, _ ...string) (string, error) {
if name == "kubectl" {
return "", errors.New("pods boom")
}
return "", nil
},
wantErr: "query pods for scheduling storm scan",
},
{
name: "pods-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
if name == "kubectl" && strings.Contains(strings.Join(args, " "), "get pods -A -o json") {
return "{", nil
}
return `{"items":[]}`, nil
},
wantErr: "decode pods for scheduling storm scan",
},
{
name: "replicasets-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "", errors.New("replicasets boom")
default:
return "", nil
}
},
wantErr: "query replicasets for scheduling storm scan",
},
{
name: "replicasets-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode replicasets for scheduling storm scan",
},
{
name: "events-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "", errors.New("events boom")
default:
return "", nil
}
},
wantErr: "query events for scheduling storm scan",
},
{
name: "events-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return "{", nil
default:
return `{"items":[]}`, nil
}
},
wantErr: "decode events for scheduling storm scan",
},
{
name: "workloads-query-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "", errors.New("workloads boom")
default:
return "", nil
}
},
wantErr: "query workloads for scheduling storm scan",
},
{
name: "workloads-decode-error",
run: func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return "{", nil
default:
return "", nil
}
},
wantErr: "decode workloads for scheduling storm scan",
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
orch, _ := newHookOrchestrator(t, cfg, tc.run, tc.run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
t.Fatalf("expected error containing %q, got %v", tc.wantErr, err)
}
})
}
}
// TestHookSchedulingStormScaleError runs one orchestration or CLI step.
// Signature: TestHookSchedulingStormScaleError(t *testing.T).
// Why: covers the final error path where Ananke detects a real storm but cannot
// scale the offending workload down.
func TestHookSchedulingStormScaleError(t *testing.T) {
now := time.Now().UTC().Format(time.RFC3339)
cfg := lifecycleConfig(t)
cfg.Startup.AutoQuarantineSchedulingStorms = true
cfg.Startup.SchedulingStormEventThreshold = 5
cfg.Startup.SchedulingStormWindowSeconds = 60
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-pod","ownerReferences":[{"kind":"ReplicaSet","name":"ollama-rs"}]},"spec":{},"status":{"phase":"Pending"}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get replicasets -A -o json"):
return `{"items":[{"metadata":{"namespace":"ai","name":"ollama-rs","ownerReferences":[{"kind":"Deployment","name":"ollama"}]}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get events -A -o json"):
return `{"items":[{"metadata":{"creationTimestamp":"` + now + `"},"involvedObject":{"kind":"Pod","namespace":"ai","name":"ollama-pod"},"type":"Warning","reason":"FailedScheduling","count":45}]}`, nil
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"ai","name":"ollama"},"spec":{"replicas":1}}]}`, nil
case name == "kubectl" && strings.Contains(command, "-n ai scale deployment ollama --replicas=0"):
return "", errors.New("scale denied")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.TestHookQuarantineSchedulingStormWorkloads(context.Background())
if err == nil || !strings.Contains(err.Error(), "scale scheduling storm workload ai/deployment/ollama to 0") {
t.Fatalf("expected scale error, got %v", err)
}
}

View File

@ -1,222 +0,0 @@
package orchestrator
import (
"context"
"errors"
"os"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
)
// readStartupProgress runs one orchestration or CLI step.
// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
// Why: startup helper tests need to inspect progress artifacts without reaching
// into internal package state from the top-level testing module.
func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
t.Helper()
payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
if err != nil {
t.Fatalf("read startup progress: %v", err)
}
return string(payload)
}
// TestHookStartupScopeAndVaultHelpers runs one orchestration or CLI step.
// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
// Why: keeps startup-scope and startup-Vault helper branches covered from the
// split top-level testing module required by the repo hygiene contract.
func TestHookStartupScopeAndVaultHelpers(t *testing.T) {
t.Run("startup-scope-helpers", func(t *testing.T) {
nodes := []string{"titan-db", " titan-23 ", "", "titan-24"}
if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) {
t.Fatalf("expected passthrough node list, got %v", got)
}
got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"})
if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" {
t.Fatalf("unexpected filtered node list: %v", got)
}
if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") {
t.Fatalf("expected trimmed node membership match")
}
if cluster.TestHookContainsNode([]string{"titan-db"}, " ") {
t.Fatalf("expected blank node probe to be ignored")
}
cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
if !orch.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected all nodes to be strict when no recovery scopes are configured")
}
cfgScoped := lifecycleConfig(t)
cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"}
cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"}
cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "}
cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "}
orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil)
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") {
t.Fatalf("expected control plane to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") {
t.Fatalf("expected inventory-scoped node to remain strict")
}
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") {
t.Fatalf("expected ssh-scoped node to remain strict")
}
if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") {
t.Fatalf("expected non-core worker to stop being strict")
}
flux := orchScoped.TestHookStartupRequiredFluxKustomizations()
if _, ok := flux["flux-system/core"]; !ok {
t.Fatalf("expected core flux kustomization in required set: %v", flux)
}
if _, ok := flux["flux-system/gitea"]; !ok {
t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux)
}
namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces()
if _, ok := namespaces["vault"]; !ok {
t.Fatalf("expected vault namespace in required set: %v", namespaces)
}
if _, ok := namespaces["monitoring"]; !ok {
t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces)
}
})
t.Run("startup-vault-helpers", func(t *testing.T) {
t.Run("early-vault-unseal-paths", func(t *testing.T) {
cfgAPI := lifecycleConfig(t)
runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
return "", errors.New("api down")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI)
orchAPI.TestHookBeginStartupReport("startup-vault")
orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") {
t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload)
}
cfgErr := lifecycleConfig(t)
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "", errors.New("phase probe failed")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
orchErr.TestHookBeginStartupReport("startup-vault")
orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") {
t.Fatalf("expected early vault auto-heal detail, payload=%s", payload)
}
cfgDeferred := lifecycleConfig(t)
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Pending", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
orchDeferred.TestHookBeginStartupReport("startup-vault")
orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
t.Fatalf("expected deferred early vault detail, payload=%s", payload)
}
cfgSuccess := lifecycleConfig(t)
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "v1.31.0", nil
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
return `{"sealed":false,"initialized":true}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
orchSuccess.TestHookBeginStartupReport("startup-vault")
orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background())
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) {
t.Fatalf("expected successful early vault check, payload=%s", payload)
}
})
t.Run("startup-vault-gate-paths", func(t *testing.T) {
cfgErr := lifecycleConfig(t)
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
return "", errors.New("phase probe failed")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
orchErr.TestHookBeginStartupReport("startup-vault")
if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") {
t.Fatalf("expected startup vault gate error, got %v", err)
}
cfgDeferred := lifecycleConfig(t)
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
return "Pending", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
orchDeferred.TestHookBeginStartupReport("startup-vault")
if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
t.Fatalf("expected deferred startup vault gate to succeed, got %v", err)
}
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
t.Fatalf("expected deferred startup vault detail, payload=%s", payload)
}
cfgSuccess := lifecycleConfig(t)
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
return "Running", nil
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
return `{"sealed":false,"initialized":true}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
orchSuccess.TestHookBeginStartupReport("startup-vault")
if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
t.Fatalf("expected successful startup vault gate, got %v", err)
}
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) {
t.Fatalf("expected successful startup vault detail, payload=%s", payload)
}
})
})
}
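
The vault helper cases above encode an ordering that is easy to miss in the fixtures: the early unseal is skipped entirely when the API server does not answer, deferred (but not fatal) while vault-0 is not Running or its phase cannot be probed, and only then consults vault status. A compact sketch of that decision order follows; the vaultProbe struct, its callbacks, and the report strings are invented for illustration and are not the orchestrator's real API.

package sketch // assumed decision order for the early vault unseal check

import "context"

type vaultProbe struct {
	apiReachable func(ctx context.Context) bool
	podPhase     func(ctx context.Context) (string, error)
	sealed       func(ctx context.Context) (bool, error)
	unseal       func(ctx context.Context) error
	report       func(check, detail string)
}

// maybeEarlyUnseal mirrors the branches the tests drive: no API means no report at all,
// a phase error or non-Running pod defers the check, and otherwise the vault is unsealed
// if needed and the check is reported as passed.
func (p vaultProbe) maybeEarlyUnseal(ctx context.Context) {
	if !p.apiReachable(ctx) {
		return // skipped silently; the test asserts no vault-unseal-early entry appears
	}
	phase, err := p.podPhase(ctx)
	if err != nil || phase != "Running" {
		p.report("vault-unseal-early", "deferred early vault unseal")
		return
	}
	sealed, err := p.sealed(ctx)
	if err != nil {
		p.report("vault-unseal-early", "deferred early vault unseal")
		return
	}
	if sealed {
		_ = p.unseal(ctx)
	}
	p.report("vault-unseal-early", "passed")
}
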

View File

@ -24,36 +24,12 @@ func TestStateTestHookOverrideSetters(t *testing.T) {
}
restoreWriteNil()
restoreReadNil := state.TestHookSetReadIntentOverride(nil)
readAfterNil, err := state.ReadIntent(intentPath)
if err != nil || readAfterNil.State != state.IntentNormal {
t.Fatalf("expected default read intent path after nil override, got %v / %v", readAfterNil, err)
}
restoreReadNil()
readOverrideCalled := false
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
readOverrideCalled = true
return state.Intent{}, errors.New("forced read override")
})
_, err = state.ReadIntent(intentPath)
if err == nil || !strings.Contains(err.Error(), "forced read override") {
t.Fatalf("expected forced read override error, got %v", err)
}
if !readOverrideCalled {
t.Fatalf("expected read override to be invoked")
}
restoreRead()
if _, err := state.TestHookReadIntentDefault(intentPath); err != nil {
t.Fatalf("expected explicit default read helper to succeed, got %v", err)
}
writeOverrideCalled := false
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
writeOverrideCalled = true
return errors.New("forced write override")
})
err = state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
err := state.WriteIntent(intentPath, state.Intent{State: state.IntentNormal})
if err == nil || !strings.Contains(err.Error(), "forced write override") {
t.Fatalf("expected forced write override error, got %v", err)
}