From 1f656de5df82ae43b1184203bede9ea9786472d6 Mon Sep 17 00:00:00 2001 From: codex Date: Tue, 5 May 2026 05:17:59 -0300 Subject: [PATCH] startup(ananke): scope emergency recovery to core services --- README.md | 14 +- configs/ananke.example.yaml | 5 + configs/ananke.tethys.yaml | 31 +++ configs/ananke.titan-db.yaml | 31 +++ .../cluster/orchestrator_access_fluxsource.go | 2 +- .../cluster/orchestrator_critical_vault.go | 25 ++ internal/cluster/orchestrator_fluxhealth.go | 23 ++ internal/cluster/orchestrator_ingress.go | 10 + internal/cluster/orchestrator_lifecycle.go | 29 +-- .../cluster/orchestrator_node_reachability.go | 2 +- .../cluster/orchestrator_shutdown_mode.go | 21 ++ .../cluster/orchestrator_startup_scope.go | 81 +++++++ .../cluster/orchestrator_startup_vault.go | 52 ++++ .../orchestrator_workload_convergence.go | 6 + .../cluster/orchestrator_workload_ignore.go | 6 + internal/cluster/testing_hooks_startup.go | 55 +++++ internal/config/apply_defaults.go | 18 +- internal/config/defaults.go | 94 ++++---- internal/config/load_additional_test.go | 38 +++ internal/config/types.go | 123 +++++----- internal/config/validate.go | 51 ++++ internal/config/validate_matrix_test.go | 17 ++ .../hooks_flux_workload_matrix_test.go | 59 +++++ .../hooks_ingress_service_matrix_test.go | 42 ++++ ...ks_lifecycle_cleanup_branch_matrix_test.go | 2 +- .../hooks_startup_scope_vault_test.go | 222 ++++++++++++++++++ ...oks_workload_storage_access_matrix_test.go | 26 ++ 27 files changed, 946 insertions(+), 139 deletions(-) create mode 100644 internal/cluster/orchestrator_shutdown_mode.go create mode 100644 internal/cluster/orchestrator_startup_scope.go create mode 100644 internal/cluster/orchestrator_startup_vault.go create mode 100644 internal/cluster/testing_hooks_startup.go create mode 100644 testing/orchestrator/hooks_startup_scope_vault_test.go diff --git a/README.md b/README.md index f8f4393..0b9673c 100644 --- a/README.md +++ b/README.md @@ -97,10 +97,15 @@ Primary config path: Keep these fields accurate: - `expected_flux_source_url` - `expected_flux_branch` +- `startup.service_checklist_explicit_only` - `startup.service_checklist` - `startup.critical_service_endpoints` - `startup.require_ingress_checklist` - `startup.require_node_inventory_reachability` +- `startup.node_inventory_reachability_required_nodes` +- `startup.node_ssh_auth_required_nodes` +- `startup.flux_health_required_kustomizations` +- `startup.workload_convergence_required_namespaces` - `startup.ignore_unavailable_nodes` - `coordination.role` - `coordination.peer_hosts` @@ -134,9 +139,10 @@ Installer behavior: When adding nodes or services: 1. Update inventory and node mapping in config. -2. Add/adjust service checklist entries for anything user-facing or critical. -3. Add/adjust ingress expectations for exposed services. -4. Use temporary ignores only when truly intentional, then remove them. -5. Run `scripts/quality_gate.sh` before host deployment. +2. Keep the explicit service checklist focused on the core services that must come back during an outage. +3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap. +4. Add/adjust ingress expectations for exposed services. +5. Use temporary ignores only when truly intentional, then remove them. +6. Run `scripts/quality_gate.sh` before host deployment. Recovery quality should improve over time: every drill should reduce manual work in the next drill. 
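For orientation, here is a minimal sketch of how the new startup scopes narrow an emergency recovery to the core set. The field names are the ones listed above; the node names, kustomizations, and namespaces mirror (in abridged form) the values this patch adds to `configs/ananke.tethys.yaml`, not built-in defaults:

```yaml
startup:
  # Gate reachability and SSH auth only on the control-plane nodes.
  node_inventory_reachability_required_nodes: [titan-0a, titan-0b, titan-0c]
  node_ssh_auth_required_nodes: [titan-0a, titan-0b, titan-0c]
  # Treat the declared checklist as complete instead of merging in defaults.
  service_checklist_explicit_only: true
  # Wait only on the core GitOps slice and core namespaces during bootstrap.
  flux_health_required_kustomizations:
    - flux-system/core
    - flux-system/vault
    - flux-system/postgres
  workload_convergence_required_namespaces: [vault, postgres, sso]
```

Leaving any of these lists empty keeps the stricter all-nodes / all-kustomizations behavior, which is what the empty-slice defaults in `internal/config/defaults.go` provide.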
diff --git a/configs/ananke.example.yaml b/configs/ananke.example.yaml index f18aeb8..e12a1bf 100644 --- a/configs/ananke.example.yaml +++ b/configs/ananke.example.yaml @@ -51,6 +51,7 @@ startup: require_node_inventory_reachability: true node_inventory_reachability_wait_seconds: 300 node_inventory_reachability_poll_seconds: 5 + node_inventory_reachability_required_nodes: [] required_node_labels: titan-09: ananke.bstein.dev/harbor-bootstrap: "true" @@ -90,6 +91,7 @@ startup: admin_secret_name: keycloak-admin admin_secret_username_key: username admin_secret_password_key: password + service_checklist_explicit_only: false service_checklist: - name: gitea-api url: https://scm.bstein.dev/api/healthz @@ -134,13 +136,16 @@ startup: require_node_ssh_auth: true node_ssh_auth_wait_seconds: 240 node_ssh_auth_poll_seconds: 5 + node_ssh_auth_required_nodes: [] require_flux_health: true flux_health_wait_seconds: 900 flux_health_poll_seconds: 5 + flux_health_required_kustomizations: [] ignore_flux_kustomizations: [] require_workload_convergence: true workload_convergence_wait_seconds: 900 workload_convergence_poll_seconds: 5 + workload_convergence_required_namespaces: [] ignore_workload_namespaces: [] ignore_workloads: [] ignore_unavailable_nodes: [] diff --git a/configs/ananke.tethys.yaml b/configs/ananke.tethys.yaml index 378ee61..e8dfcba 100644 --- a/configs/ananke.tethys.yaml +++ b/configs/ananke.tethys.yaml @@ -117,6 +117,10 @@ startup: require_node_inventory_reachability: true node_inventory_reachability_wait_seconds: 300 node_inventory_reachability_poll_seconds: 5 + node_inventory_reachability_required_nodes: + - titan-0a + - titan-0b + - titan-0c required_node_labels: titan-09: ananke.bstein.dev/harbor-bootstrap: "true" @@ -156,6 +160,7 @@ startup: admin_secret_name: keycloak-admin admin_secret_username_key: username admin_secret_password_key: password + service_checklist_explicit_only: true service_checklist: - name: gitea-api url: https://scm.bstein.dev/api/healthz @@ -200,13 +205,39 @@ startup: require_node_ssh_auth: true node_ssh_auth_wait_seconds: 240 node_ssh_auth_poll_seconds: 5 + node_ssh_auth_required_nodes: + - titan-0a + - titan-0b + - titan-0c require_flux_health: true flux_health_wait_seconds: 900 flux_health_poll_seconds: 5 + flux_health_required_kustomizations: + - flux-system/core + - flux-system/helm + - flux-system/traefik + - flux-system/cert-manager + - flux-system/longhorn + - flux-system/vault-csi + - flux-system/vault-injector + - flux-system/postgres + - flux-system/vault + - flux-system/keycloak + - flux-system/oauth2-proxy + - flux-system/gitea + - flux-system/monitoring + - flux-system/harbor ignore_flux_kustomizations: [] require_workload_convergence: true workload_convergence_wait_seconds: 900 workload_convergence_poll_seconds: 5 + workload_convergence_required_namespaces: + - vault + - postgres + - sso + - gitea + - monitoring + - harbor ignore_workload_namespaces: [] ignore_workloads: [] ignore_unavailable_nodes: [] diff --git a/configs/ananke.titan-db.yaml b/configs/ananke.titan-db.yaml index d59a6b6..680c7ac 100644 --- a/configs/ananke.titan-db.yaml +++ b/configs/ananke.titan-db.yaml @@ -117,6 +117,10 @@ startup: require_node_inventory_reachability: true node_inventory_reachability_wait_seconds: 300 node_inventory_reachability_poll_seconds: 5 + node_inventory_reachability_required_nodes: + - titan-0a + - titan-0b + - titan-0c required_node_labels: titan-09: ananke.bstein.dev/harbor-bootstrap: "true" @@ -156,6 +160,7 @@ startup: admin_secret_name: keycloak-admin 
admin_secret_username_key: username admin_secret_password_key: password + service_checklist_explicit_only: true service_checklist: - name: gitea-api url: https://scm.bstein.dev/api/healthz @@ -200,13 +205,39 @@ startup: require_node_ssh_auth: true node_ssh_auth_wait_seconds: 240 node_ssh_auth_poll_seconds: 5 + node_ssh_auth_required_nodes: + - titan-0a + - titan-0b + - titan-0c require_flux_health: true flux_health_wait_seconds: 900 flux_health_poll_seconds: 5 + flux_health_required_kustomizations: + - flux-system/core + - flux-system/helm + - flux-system/traefik + - flux-system/cert-manager + - flux-system/longhorn + - flux-system/vault-csi + - flux-system/vault-injector + - flux-system/postgres + - flux-system/vault + - flux-system/keycloak + - flux-system/oauth2-proxy + - flux-system/gitea + - flux-system/monitoring + - flux-system/harbor ignore_flux_kustomizations: [] require_workload_convergence: true workload_convergence_wait_seconds: 900 workload_convergence_poll_seconds: 5 + workload_convergence_required_namespaces: + - vault + - postgres + - sso + - gitea + - monitoring + - harbor ignore_workload_namespaces: [] ignore_workloads: [] ignore_unavailable_nodes: [] diff --git a/internal/cluster/orchestrator_access_fluxsource.go b/internal/cluster/orchestrator_access_fluxsource.go index 22c1947..529a875 100644 --- a/internal/cluster/orchestrator_access_fluxsource.go +++ b/internal/cluster/orchestrator_access_fluxsource.go @@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) seen := map[string]struct{}{} targets := make([]string, 0, len(nodes)) - for _, node := range nodes { + for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) { node = strings.TrimSpace(node) if node == "" { continue diff --git a/internal/cluster/orchestrator_critical_vault.go b/internal/cluster/orchestrator_critical_vault.go index 907c787..c93136e 100644 --- a/internal/cluster/orchestrator_critical_vault.go +++ b/internal/cluster/orchestrator_critical_vault.go @@ -227,6 +227,31 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name) } +// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step. +// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error). +// Why: lets startup defer vault unseal until the pod is actually runnable, while +// keeping the direct unseal helper strict for explicit recovery paths and tests. +func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) { + if o.runner.DryRun { + return false, "", nil + } + + phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}") + if err != nil { + if isNotFoundErr(err) { + return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil + } + return false, "", fmt.Errorf("vault pod phase check failed: %w", err) + } + + trimmedPhase := strings.TrimSpace(phase) + if trimmedPhase != "Running" { + return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil + } + + return false, "", o.ensureVaultUnsealed(ctx) +} + // ensureVaultUnsealed runs one orchestration or CLI step. // Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error. 
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. diff --git a/internal/cluster/orchestrator_fluxhealth.go b/internal/cluster/orchestrator_fluxhealth.go index b180056..2574e7f 100644 --- a/internal/cluster/orchestrator_fluxhealth.go +++ b/internal/cluster/orchestrator_fluxhealth.go @@ -143,6 +143,8 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error return false, "", fmt.Errorf("decode flux kustomizations: %w", err) } ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations) + required := o.startupRequiredFluxKustomizations() + requiredSeen := map[string]struct{}{} notReady := []string{} for _, ks := range list.Items { ns := strings.TrimSpace(ks.Metadata.Namespace) @@ -154,6 +156,12 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error if ks.Spec.Suspend { continue } + if len(required) > 0 { + if _, ok := required[full]; !ok { + continue + } + requiredSeen[full] = struct{}{} + } if _, ok := ignored[full]; ok { continue } @@ -173,10 +181,25 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error } notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason)) } + if len(required) > 0 { + missing := []string{} + for full := range required { + if _, ok := requiredSeen[full]; !ok { + missing = append(missing, full+"(missing)") + } + } + if len(missing) > 0 { + sort.Strings(missing) + notReady = append(notReady, missing...) + } + } if len(notReady) > 0 { sort.Strings(notReady) return false, "not ready: " + joinLimited(notReady, 6), nil } + if len(required) > 0 { + return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil + } return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil } diff --git a/internal/cluster/orchestrator_ingress.go b/internal/cluster/orchestrator_ingress.go index beae3c1..134c383 100644 --- a/internal/cluster/orchestrator_ingress.go +++ b/internal/cluster/orchestrator_ingress.go @@ -19,6 +19,7 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error { if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 { return nil } + ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels)) for node := range o.cfg.Startup.RequiredNodeLabels { node = strings.TrimSpace(node) @@ -28,6 +29,10 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error { } sort.Strings(nodes) for _, node := range nodes { + if _, skip := ignored[node]; skip { + o.log.Printf("skipping required node labels for ignored unavailable node %s", node) + continue + } labels := o.cfg.Startup.RequiredNodeLabels[node] if len(labels) == 0 { continue @@ -55,6 +60,11 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error { continue } if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil { + if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) { + o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err) + o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node)) + continue + } return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err) } o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", ")) diff --git a/internal/cluster/orchestrator_lifecycle.go b/internal/cluster/orchestrator_lifecycle.go index acfa84d..5e849be 
100644 --- a/internal/cluster/orchestrator_lifecycle.go +++ b/internal/cluster/orchestrator_lifecycle.go @@ -37,14 +37,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er return invErr } o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed") - if err := o.waitForAPI(ctx, 1, time.Second); err == nil { - o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed while kubernetes api is already available") - if err := o.ensureVaultUnsealed(ctx); err != nil { - o.noteStartupCheck("vault-unseal", false, err.Error()) - return err - } - o.noteStartupCheck("vault-unseal", true, "vault is unsealed") - } + o.maybeRunEarlyVaultUnseal(ctx) o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory") if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil { o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error()) @@ -187,12 +180,9 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er } } o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable") - o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates") - if err := o.ensureVaultUnsealed(ctx); err != nil { - o.noteStartupCheck("vault-unseal", false, err.Error()) + if err := o.runStartupVaultUnsealGate(ctx); err != nil { return err } - o.noteStartupCheck("vault-unseal", true, "vault is unsealed") if err := o.ensureRequiredNodeLabels(ctx); err != nil { return err } @@ -490,18 +480,3 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err o.log.Printf("shutdown flow complete") return nil } - -// normalizeShutdownMode runs one orchestration or CLI step. -// Signature: normalizeShutdownMode(raw string) (string, error). -// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only -// semantics while preserving compatibility with legacy "config" callers. -func normalizeShutdownMode(raw string) (string, error) { - switch strings.TrimSpace(raw) { - case "", "config", "cluster-only": - return "cluster-only", nil - case "poweroff": - return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw) - default: - return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw) - } -} diff --git a/internal/cluster/orchestrator_node_reachability.go b/internal/cluster/orchestrator_node_reachability.go index 9209f75..275cc4e 100644 --- a/internal/cluster/orchestrator_node_reachability.go +++ b/internal/cluster/orchestrator_node_reachability.go @@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) targets := make([]string, 0, len(o.inventoryNodesForValidation())) seen := map[string]struct{}{} - for _, node := range o.inventoryNodesForValidation() { + for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) { node = strings.TrimSpace(node) if node == "" { continue diff --git a/internal/cluster/orchestrator_shutdown_mode.go b/internal/cluster/orchestrator_shutdown_mode.go new file mode 100644 index 0000000..9ab9f69 --- /dev/null +++ b/internal/cluster/orchestrator_shutdown_mode.go @@ -0,0 +1,21 @@ +package cluster + +import ( + "fmt" + "strings" +) + +// normalizeShutdownMode runs one orchestration or CLI step. 
+// Signature: normalizeShutdownMode(raw string) (string, error). +// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only +// semantics while preserving compatibility with legacy "config" callers. +func normalizeShutdownMode(raw string) (string, error) { + switch strings.TrimSpace(raw) { + case "", "config", "cluster-only": + return "cluster-only", nil + case "poweroff": + return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw) + default: + return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw) + } +} diff --git a/internal/cluster/orchestrator_startup_scope.go b/internal/cluster/orchestrator_startup_scope.go new file mode 100644 index 0000000..7b6c515 --- /dev/null +++ b/internal/cluster/orchestrator_startup_scope.go @@ -0,0 +1,81 @@ +package cluster + +import "strings" + +// startupRequiredNodes runs one orchestration or CLI step. +// Signature: startupRequiredNodes(nodes []string, required []string) []string. +// Why: lets startup enforce a smaller core node set during outage recovery +// without losing the stricter all-nodes behavior when no override is configured. +func startupRequiredNodes(nodes []string, required []string) []string { + requiredSet := makeStringSet(required) + if len(requiredSet) == 0 { + return nodes + } + filtered := make([]string, 0, len(nodes)) + for _, node := range nodes { + node = strings.TrimSpace(node) + if node == "" { + continue + } + if _, ok := requiredSet[node]; ok { + filtered = append(filtered, node) + } + } + return filtered +} + +// startupNodeStrictlyRequired runs one orchestration or CLI step. +// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool. +// Why: absent or broken non-core nodes should not block recovery-only actions +// like label reconciliation once the operator has narrowed startup to core nodes. +func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool { + node = strings.TrimSpace(node) + if node == "" { + return false + } + if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 { + return true + } + for _, controlPlane := range o.cfg.ControlPlanes { + if strings.TrimSpace(controlPlane) == node { + return true + } + } + if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) { + return true + } + return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node) +} + +// startupRequiredFluxKustomizations runs one orchestration or CLI step. +// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}. +// Why: lets outage recovery wait on a declared core GitOps slice while leaving +// optional stacks free to converge after bootstrap succeeds. +func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} { + return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations) +} + +// startupRequiredWorkloadNamespaces runs one orchestration or CLI step. +// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}. +// Why: keeps workload readiness scoped to core namespaces during recovery while +// preserving broad convergence checks when no explicit core list is configured. +func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} { + return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces) +} + +// containsNode runs one orchestration or CLI step. +// Signature: containsNode(entries []string, needle string) bool. 
+// Why: keeps node-scope checks small and explicit anywhere startup narrows its +// recovery gates to a declared core set. +func containsNode(entries []string, needle string) bool { + needle = strings.TrimSpace(needle) + if needle == "" { + return false + } + for _, entry := range entries { + if strings.TrimSpace(entry) == needle { + return true + } + } + return false +} diff --git a/internal/cluster/orchestrator_startup_vault.go b/internal/cluster/orchestrator_startup_vault.go new file mode 100644 index 0000000..0a671c1 --- /dev/null +++ b/internal/cluster/orchestrator_startup_vault.go @@ -0,0 +1,52 @@ +package cluster + +import ( + "context" + "fmt" + "time" +) + +// maybeRunEarlyVaultUnseal runs one orchestration or CLI step. +// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context). +// Why: gives startup a best-effort Vault recovery path when the API is already +// live, without consuming the hard startup failure path before workloads recover. +func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) { + if err := o.waitForAPI(ctx, 1, time.Second); err != nil { + return + } + + o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available") + deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx) + if err != nil { + o.log.Printf("warning: early vault unseal deferred: %v", err) + o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err)) + return + } + if deferred { + o.log.Printf("vault early unseal deferred: %s", detail) + o.noteStartupAutoHeal(detail) + return + } + o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed") +} + +// runStartupVaultUnsealGate runs one orchestration or CLI step. +// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error. +// Why: keeps the top-level startup flow readable while allowing Vault unseal to +// defer cleanly until critical workload recovery when the pod is not runnable yet. 
+func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error { + o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates") + deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx) + if err != nil { + o.noteStartupCheck("vault-unseal", false, err.Error()) + return err + } + if deferred { + o.log.Printf("vault unseal deferred until workload recovery: %s", detail) + o.noteStartupAutoHeal(detail) + o.noteStartupCheck("vault-unseal", true, detail) + return nil + } + o.noteStartupCheck("vault-unseal", true, "vault is unsealed") + return nil +} diff --git a/internal/cluster/orchestrator_workload_convergence.go b/internal/cluster/orchestrator_workload_convergence.go index a9bd35f..179b0ca 100644 --- a/internal/cluster/orchestrator_workload_convergence.go +++ b/internal/cluster/orchestrator_workload_convergence.go @@ -71,6 +71,7 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri if err := json.Unmarshal([]byte(out), &list); err != nil { return false, "", fmt.Errorf("decode controllers: %w", err) } + requiredNamespaces := o.startupRequiredWorkloadNamespaces() ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads) @@ -84,6 +85,11 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri if kind == "" || ns == "" || name == "" { continue } + if len(requiredNamespaces) > 0 { + if _, ok := requiredNamespaces[ns]; !ok { + continue + } + } if _, ok := ignoredNamespaces[ns]; ok { continue } diff --git a/internal/cluster/orchestrator_workload_ignore.go b/internal/cluster/orchestrator_workload_ignore.go index 6405731..c286186 100644 --- a/internal/cluster/orchestrator_workload_ignore.go +++ b/internal/cluster/orchestrator_workload_ignore.go @@ -116,6 +116,7 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) return nil, fmt.Errorf("decode pods: %w", err) } + requiredNamespaces := o.startupRequiredWorkloadNamespaces() ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) stuckReasons := map[string]struct{}{ @@ -138,6 +139,11 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) if ns == "" || name == "" { continue } + if len(requiredNamespaces) > 0 { + if _, ok := requiredNamespaces[ns]; !ok { + continue + } + } if _, ok := ignoredNamespaces[ns]; ok { continue } diff --git a/internal/cluster/testing_hooks_startup.go b/internal/cluster/testing_hooks_startup.go new file mode 100644 index 0000000..ee32165 --- /dev/null +++ b/internal/cluster/testing_hooks_startup.go @@ -0,0 +1,55 @@ +package cluster + +import "context" + +// TestHookStartupRequiredNodes runs one orchestration or CLI step. +// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string. +// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing. +func TestHookStartupRequiredNodes(nodes []string, required []string) []string { + return startupRequiredNodes(nodes, required) +} + +// TestHookContainsNode runs one orchestration or CLI step. +// Signature: TestHookContainsNode(entries []string, needle string) bool. +// Why: exposes the small startup-scope membership helper to top-level tests. 
+func TestHookContainsNode(entries []string, needle string) bool { + return containsNode(entries, needle) +} + +// TestHookStartupNodeStrictlyRequired runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool. +// Why: exposes strict-node startup scoping so outage-recovery tests can confirm +// non-core nodes stop blocking bootstrap. +func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool { + return o.startupNodeStrictlyRequired(node) +} + +// TestHookStartupRequiredFluxKustomizations runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}. +// Why: exposes flux startup scoping so top-level tests can confirm only core +// kustomizations block emergency bootstrap. +func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} { + return o.startupRequiredFluxKustomizations() +} + +// TestHookStartupRequiredWorkloadNamespaces runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}. +// Why: exposes workload namespace startup scoping so top-level tests can +// confirm only core workloads block emergency bootstrap. +func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} { + return o.startupRequiredWorkloadNamespaces() +} + +// TestHookMaybeRunEarlyVaultUnseal runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context). +// Why: exposes the early startup Vault deferral helper to top-level tests. +func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) { + o.maybeRunEarlyVaultUnseal(ctx) +} + +// TestHookRunStartupVaultUnsealGate runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error. +// Why: exposes the startup Vault gate helper to top-level tests. 
+func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error { + return o.runStartupVaultUnsealGate(ctx) +} diff --git a/internal/config/apply_defaults.go b/internal/config/apply_defaults.go index 88ebf3b..7363fb6 100644 --- a/internal/config/apply_defaults.go +++ b/internal/config/apply_defaults.go @@ -33,6 +33,9 @@ func (c *Config) applyDefaults() { if c.Startup.NodeInventoryReachPollSeconds <= 0 { c.Startup.NodeInventoryReachPollSeconds = 5 } + if c.Startup.NodeInventoryReachRequiredNodes == nil { + c.Startup.NodeInventoryReachRequiredNodes = []string{} + } if c.Startup.RequiredNodeLabels == nil { c.Startup.RequiredNodeLabels = map[string]map[string]string{ "titan-09": { @@ -121,7 +124,11 @@ func (c *Config) applyDefaults() { if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" { c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password" } - c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist()) + if c.Startup.ServiceChecklistExplicitOnly { + c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{}) + } else { + c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist()) + } for i := range c.Startup.ServiceChecklist { if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 { c.Startup.ServiceChecklist[i].TimeoutSeconds = 12 @@ -152,12 +159,18 @@ func (c *Config) applyDefaults() { if c.Startup.NodeSSHAuthPollSeconds <= 0 { c.Startup.NodeSSHAuthPollSeconds = 5 } + if c.Startup.NodeSSHAuthRequiredNodes == nil { + c.Startup.NodeSSHAuthRequiredNodes = []string{} + } if c.Startup.FluxHealthWaitSeconds <= 0 { c.Startup.FluxHealthWaitSeconds = 900 } if c.Startup.FluxHealthPollSeconds <= 0 { c.Startup.FluxHealthPollSeconds = 5 } + if c.Startup.FluxHealthRequiredKustomizations == nil { + c.Startup.FluxHealthRequiredKustomizations = []string{} + } if c.Startup.IgnoreFluxKustomizations == nil { c.Startup.IgnoreFluxKustomizations = []string{} } @@ -167,6 +180,9 @@ func (c *Config) applyDefaults() { if c.Startup.WorkloadConvergencePollSeconds <= 0 { c.Startup.WorkloadConvergencePollSeconds = 5 } + if c.Startup.WorkloadConvergenceRequiredNamespaces == nil { + c.Startup.WorkloadConvergenceRequiredNamespaces = []string{} + } if c.Startup.IgnoreWorkloadNamespaces == nil { c.Startup.IgnoreWorkloadNamespaces = []string{} } diff --git a/internal/config/defaults.go b/internal/config/defaults.go index b1bcfa4..2d33a39 100644 --- a/internal/config/defaults.go +++ b/internal/config/defaults.go @@ -39,24 +39,25 @@ func defaults() Config { "maintenance", }, Startup: Startup{ - APIWaitSeconds: 1200, - APIPollSeconds: 2, - ShutdownCooldownSeconds: 45, - RequireNodeInventoryReach: true, - NodeInventoryReachWaitSeconds: 300, - NodeInventoryReachPollSeconds: 5, - RequireTimeSync: true, - TimeSyncWaitSeconds: 240, - TimeSyncPollSeconds: 5, - TimeSyncMode: "quorum", - TimeSyncQuorum: 2, - ReconcileAccessOnBoot: true, - AutoEtcdRestoreOnAPIFailure: true, - EtcdRestoreControlPlane: "titan-0a", - RequireStorageReady: true, - StorageReadyWaitSeconds: 420, - StorageReadyPollSeconds: 5, - StorageMinReadyNodes: 2, + APIWaitSeconds: 1200, + APIPollSeconds: 2, + ShutdownCooldownSeconds: 45, + RequireNodeInventoryReach: true, + NodeInventoryReachWaitSeconds: 300, + NodeInventoryReachPollSeconds: 5, + NodeInventoryReachRequiredNodes: []string{}, + RequireTimeSync: true, + TimeSyncWaitSeconds: 240, + 
TimeSyncPollSeconds: 5, + TimeSyncMode: "quorum", + TimeSyncQuorum: 2, + ReconcileAccessOnBoot: true, + AutoEtcdRestoreOnAPIFailure: true, + EtcdRestoreControlPlane: "titan-0a", + RequireStorageReady: true, + StorageReadyWaitSeconds: 420, + StorageReadyPollSeconds: 5, + StorageMinReadyNodes: 2, StorageCriticalPVCs: []string{ "vault/data-vault-0", "postgres/postgres-data-postgres-0", @@ -91,33 +92,36 @@ func defaults() Config { AdminSecretUsernameKey: "username", AdminSecretPasswordKey: "password", }, - ServiceChecklist: defaultServiceChecklist(), - RequireCriticalServiceEndpoints: true, - CriticalServiceEndpointWaitSec: 420, - CriticalServiceEndpointPollSec: 5, - CriticalServiceEndpoints: defaultCriticalServiceEndpoints(), - RequireIngressChecklist: true, - IngressChecklistWaitSeconds: 420, - IngressChecklistPollSeconds: 5, - IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404}, - IngressChecklistIgnoreHosts: []string{}, - RequireNodeSSHAuth: true, - NodeSSHAuthWaitSeconds: 240, - NodeSSHAuthPollSeconds: 5, - RequireFluxHealth: true, - FluxHealthWaitSeconds: 900, - FluxHealthPollSeconds: 5, - IgnoreFluxKustomizations: []string{}, - RequireWorkloadConvergence: true, - WorkloadConvergenceWaitSeconds: 900, - WorkloadConvergencePollSeconds: 5, - IgnoreWorkloadNamespaces: []string{}, - IgnoreWorkloads: []string{}, - IgnoreUnavailableNodes: []string{}, - AutoRecycleStuckPods: true, - StuckPodGraceSeconds: 180, - VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key", - VaultUnsealBreakglassTimeout: 15, + ServiceChecklist: defaultServiceChecklist(), + RequireCriticalServiceEndpoints: true, + CriticalServiceEndpointWaitSec: 420, + CriticalServiceEndpointPollSec: 5, + CriticalServiceEndpoints: defaultCriticalServiceEndpoints(), + RequireIngressChecklist: true, + IngressChecklistWaitSeconds: 420, + IngressChecklistPollSeconds: 5, + IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404}, + IngressChecklistIgnoreHosts: []string{}, + RequireNodeSSHAuth: true, + NodeSSHAuthWaitSeconds: 240, + NodeSSHAuthPollSeconds: 5, + NodeSSHAuthRequiredNodes: []string{}, + RequireFluxHealth: true, + FluxHealthWaitSeconds: 900, + FluxHealthPollSeconds: 5, + FluxHealthRequiredKustomizations: []string{}, + IgnoreFluxKustomizations: []string{}, + RequireWorkloadConvergence: true, + WorkloadConvergenceWaitSeconds: 900, + WorkloadConvergencePollSeconds: 5, + WorkloadConvergenceRequiredNamespaces: []string{}, + IgnoreWorkloadNamespaces: []string{}, + IgnoreWorkloads: []string{}, + IgnoreUnavailableNodes: []string{}, + AutoRecycleStuckPods: true, + StuckPodGraceSeconds: 180, + VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key", + VaultUnsealBreakglassTimeout: 15, }, Shutdown: Shutdown{ DefaultBudgetSeconds: 1380, diff --git a/internal/config/load_additional_test.go b/internal/config/load_additional_test.go index ff73f26..dd85311 100644 --- a/internal/config/load_additional_test.go +++ b/internal/config/load_additional_test.go @@ -51,3 +51,41 @@ startup: t.Fatalf("expected validation failure") } } + +// TestLoadKeepsExplicitServiceChecklist runs one orchestration or CLI step. +// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T). +// Why: host recovery configs must be able to keep a narrow, explicit checklist +// without silently inheriting the full default service catalog. 
+func TestLoadKeepsExplicitServiceChecklist(t *testing.T) { + cfgPath := filepath.Join(t.TempDir(), "ananke.yaml") + raw := ` +control_planes: [titan-0a] +expected_flux_branch: main +expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git +iac_repo_path: /opt/titan-iac +startup: + service_checklist_explicit_only: true + service_checklist: + - name: gitea-api + url: https://scm.bstein.dev/api/healthz + accepted_statuses: [200] + body_contains: pass + timeout_seconds: 12 +ups: + enabled: false +` + if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil { + t.Fatalf("write config: %v", err) + } + + cfg, err := Load(cfgPath) + if err != nil { + t.Fatalf("load config: %v", err) + } + if len(cfg.Startup.ServiceChecklist) != 1 { + t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist)) + } + if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" { + t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name) + } +} diff --git a/internal/config/types.go b/internal/config/types.go index 5b74c98..1ae797c 100644 --- a/internal/config/types.go +++ b/internal/config/types.go @@ -27,65 +27,70 @@ type Config struct { } type Startup struct { - APIWaitSeconds int `yaml:"api_wait_seconds"` - APIPollSeconds int `yaml:"api_poll_seconds"` - ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"` - MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"` - RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"` - NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"` - NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"` - RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"` - RequireTimeSync bool `yaml:"require_time_sync"` - TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` - TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` - TimeSyncMode string `yaml:"time_sync_mode"` - TimeSyncQuorum int `yaml:"time_sync_quorum"` - ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` - AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` - EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` - RequireStorageReady bool `yaml:"require_storage_ready"` - StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` - StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` - StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` - StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` - RequirePostStartProbes bool `yaml:"require_post_start_probes"` - PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` - PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` - PostStartProbes []string `yaml:"post_start_probes"` - RequireServiceChecklist bool `yaml:"require_service_checklist"` - ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"` - ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"` - ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"` - ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"` - ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"` - RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"` - CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"` - CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"` - 
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"` - RequireIngressChecklist bool `yaml:"require_ingress_checklist"` - IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"` - IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"` - IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"` - IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"` - IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"` - RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"` - NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"` - NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"` - RequireFluxHealth bool `yaml:"require_flux_health"` - FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"` - FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"` - IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"` - RequireWorkloadConvergence bool `yaml:"require_workload_convergence"` - WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"` - WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"` - IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"` - IgnoreWorkloads []string `yaml:"ignore_workloads"` - IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"` - AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"` - StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"` - VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` - VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` - VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` + APIWaitSeconds int `yaml:"api_wait_seconds"` + APIPollSeconds int `yaml:"api_poll_seconds"` + ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"` + MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"` + RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"` + NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"` + NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"` + NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"` + RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"` + RequireTimeSync bool `yaml:"require_time_sync"` + TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` + TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` + TimeSyncMode string `yaml:"time_sync_mode"` + TimeSyncQuorum int `yaml:"time_sync_quorum"` + ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` + AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` + EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` + RequireStorageReady bool `yaml:"require_storage_ready"` + StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` + StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` + StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` + StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` + RequirePostStartProbes bool `yaml:"require_post_start_probes"` + PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` + PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` + PostStartProbes []string `yaml:"post_start_probes"` + RequireServiceChecklist bool `yaml:"require_service_checklist"` + ServiceChecklistWaitSeconds int 
`yaml:"service_checklist_wait_seconds"` + ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"` + ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"` + ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"` + ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"` + ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"` + RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"` + CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"` + CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"` + CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"` + RequireIngressChecklist bool `yaml:"require_ingress_checklist"` + IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"` + IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"` + IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"` + IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"` + IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"` + RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"` + NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"` + NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"` + NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"` + RequireFluxHealth bool `yaml:"require_flux_health"` + FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"` + FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"` + FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"` + IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"` + RequireWorkloadConvergence bool `yaml:"require_workload_convergence"` + WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"` + WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"` + WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"` + IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"` + IgnoreWorkloads []string `yaml:"ignore_workloads"` + IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"` + AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"` + StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"` + VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` + VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` + VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` } type ServiceChecklistCheck struct { diff --git a/internal/config/validate.go b/internal/config/validate.go index 9030bd8..e1123e9 100644 --- a/internal/config/validate.go +++ b/internal/config/validate.go @@ -61,6 +61,11 @@ func (c Config) Validate() error { if c.Startup.NodeInventoryReachPollSeconds <= 0 { return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0") } + for _, node := range c.Startup.NodeInventoryReachRequiredNodes { + if strings.TrimSpace(node) == "" { + return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty") + } + } for node, labels := range c.Startup.RequiredNodeLabels { if strings.TrimSpace(node) == "" { return fmt.Errorf("config.startup.required_node_labels keys must not be empty") @@ -233,18 +238,37 @@ func (c Config) Validate() error { if c.Startup.NodeSSHAuthPollSeconds 
<= 0 { return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0") } + for _, node := range c.Startup.NodeSSHAuthRequiredNodes { + if strings.TrimSpace(node) == "" { + return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty") + } + } if c.Startup.FluxHealthWaitSeconds <= 0 { return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0") } if c.Startup.FluxHealthPollSeconds <= 0 { return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0") } + for _, item := range c.Startup.FluxHealthRequiredKustomizations { + item = strings.TrimSpace(item) + if item == "" { + return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty") + } + if strings.Count(item, "/") != 1 { + return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item) + } + } if c.Startup.WorkloadConvergenceWaitSeconds <= 0 { return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0") } if c.Startup.WorkloadConvergencePollSeconds <= 0 { return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0") } + for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces { + if strings.TrimSpace(ns) == "" { + return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty") + } + } if c.Startup.StuckPodGraceSeconds <= 0 { return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0") } @@ -277,6 +301,16 @@ func (c Config) Validate() error { return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty") } } + for _, item := range c.Startup.FluxHealthRequiredKustomizations { + if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) { + return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item)) + } + } + for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces { + if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) { + return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns)) + } + } for _, node := range c.Startup.IgnoreUnavailableNodes { if strings.TrimSpace(node) == "" { return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty") @@ -328,3 +362,20 @@ func (c Config) Validate() error { } return nil } + +// containsTrimmed runs one orchestration or CLI step. +// Signature: containsTrimmed(entries []string, needle string) bool. +// Why: startup config now supports both required and ignored recovery scopes, so +// validation needs a single normalized overlap check for those lists. 
+func containsTrimmed(entries []string, needle string) bool { + needle = strings.TrimSpace(needle) + if needle == "" { + return false + } + for _, entry := range entries { + if strings.TrimSpace(entry) == needle { + return true + } + } + return false +} diff --git a/internal/config/validate_matrix_test.go b/internal/config/validate_matrix_test.go index bc24d81..1d746f9 100644 --- a/internal/config/validate_matrix_test.go +++ b/internal/config/validate_matrix_test.go @@ -30,6 +30,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) { {"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }}, {"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }}, {"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }}, + {"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }}, {"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }}, {"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }}, {"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }}, @@ -68,15 +69,27 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) { {"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }}, {"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }}, {"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }}, + {"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }}, {"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }}, {"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }}, + {"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }}, + {"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }}, {"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }}, {"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }}, + {"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }}, {"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }}, {"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }}, {"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }}, {"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }}, {"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }}, + {"bad_overlap_flux_required_and_ignored", func(c *Config) { + c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"} + c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"} + }}, + {"bad_overlap_workload_required_and_ignored", func(c *Config) { + c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"} + c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"} + }}, {"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }}, 
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }}, {"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }}, @@ -121,6 +134,10 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) { if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" { t.Fatalf("expected startup defaults to be set") } + if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil || + cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil { + t.Fatalf("expected startup recovery scope slices to be initialized") + } if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 { t.Fatalf("expected critical service endpoint timing defaults to be set") } diff --git a/testing/orchestrator/hooks_flux_workload_matrix_test.go b/testing/orchestrator/hooks_flux_workload_matrix_test.go index 3cf7a22..4f93ecb 100644 --- a/testing/orchestrator/hooks_flux_workload_matrix_test.go +++ b/testing/orchestrator/hooks_flux_workload_matrix_test.go @@ -79,6 +79,29 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) { } }) + t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"} + + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"): + return `{"items":[ +{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}, +{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}} +]}`, nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + } + orch, _ := newHookOrchestrator(t, cfg, run, run) + ready, detail, err := orch.TestHookFluxHealthReady(context.Background()) + if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") { + t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err) + } + }) + t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) { cfg := lifecycleConfig(t) cfg.Startup.WorkloadConvergenceWaitSeconds = 1 @@ -145,6 +168,42 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) { } }) + t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"} + cfg.Startup.StuckPodGraceSeconds = 1 + + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"): + return `{"items":[ +{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}}, +{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}} +]}`, nil + case name == "kubectl" && strings.Contains(command, "get pods -A -o json"): + return `{"items":[ +{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}}, +{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}} +]}`, nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + } + orch, _ := newHookOrchestrator(t, cfg, run, run) + ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background()) + if err != nil || !ready || !strings.Contains(detail, "controllers ready=") { + t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err) + } + failures, err := orch.TestHookStartupFailurePods(context.Background()) + if err != nil { + t.Fatalf("startup failure pod query: %v", err) + } + if len(failures) != 0 { + t.Fatalf("expected optional namespace failures to be ignored, got %v", failures) + } + }) + t.Run("critical-workload-replica-heal-branches", func(t *testing.T) { cfg := lifecycleConfig(t) run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { diff --git a/testing/orchestrator/hooks_ingress_service_matrix_test.go b/testing/orchestrator/hooks_ingress_service_matrix_test.go index 55275f7..a05ae34 100644 --- a/testing/orchestrator/hooks_ingress_service_matrix_test.go +++ b/testing/orchestrator/hooks_ingress_service_matrix_test.go @@ -53,6 +53,48 @@ func TestHookIngressServiceMatrix(t *testing.T) { } }) + t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.RequiredNodeLabels = map[string]map[string]string{ + "titan-09": { + "ananke.bstein.dev/harbor-bootstrap": "true", + }, + } + cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"} + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") { + t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command) + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + orch, _ := newHookOrchestrator(t, cfg, run, run) + if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil { + t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err) + } + }) + + t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.RequiredNodeLabels = map[string]map[string]string{ + "titan-09": { + "ananke.bstein.dev/harbor-bootstrap": "true", + }, + } + cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"} + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") { + return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found") + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ }
+ orch, _ := newHookOrchestrator(t, cfg, run, run)
+ if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
+ t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
+ }
+ })
+
 t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
 tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 w.WriteHeader(http.StatusOK)
diff --git a/testing/orchestrator/hooks_lifecycle_cleanup_branch_matrix_test.go b/testing/orchestrator/hooks_lifecycle_cleanup_branch_matrix_test.go
index 71669a6..411b374 100644
--- a/testing/orchestrator/hooks_lifecycle_cleanup_branch_matrix_test.go
+++ b/testing/orchestrator/hooks_lifecycle_cleanup_branch_matrix_test.go
@@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
 switch {
 case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
 apiVersionCalls++
- if apiVersionCalls == 1 {
+ if apiVersionCalls <= 2 {
 return "", errors.New("api down")
 }
 return "v1.31.0", nil
diff --git a/testing/orchestrator/hooks_startup_scope_vault_test.go b/testing/orchestrator/hooks_startup_scope_vault_test.go
new file mode 100644
index 0000000..6d9b0fe
--- /dev/null
+++ b/testing/orchestrator/hooks_startup_scope_vault_test.go
@@ -0,0 +1,222 @@
+package orchestrator
+
+import (
+ "context"
+ "errors"
+ "os"
+ "strings"
+ "testing"
+ "time"
+
+ "scm.bstein.dev/bstein/ananke/internal/cluster"
+)
+
+// readStartupProgress reads the startup progress report written by the orchestrator.
+// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
+// Why: startup helper tests need to inspect progress artifacts without reaching
+// into internal package state from the top-level testing module.
+func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
+ t.Helper()
+ payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
+ if err != nil {
+ t.Fatalf("read startup progress: %v", err)
+ }
+ return string(payload)
+}
+
+// TestHookStartupScopeAndVaultHelpers exercises the startup-scope and startup-Vault helper branches.
+// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
+// Why: keeps startup-scope and startup-Vault helper branches covered from the
+// split top-level testing module required by the repo hygiene contract.
+func TestHookStartupScopeAndVaultHelpers(t *testing.T) { + t.Run("startup-scope-helpers", func(t *testing.T) { + nodes := []string{"titan-db", " titan-23 ", "", "titan-24"} + if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) { + t.Fatalf("expected passthrough node list, got %v", got) + } + got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"}) + if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" { + t.Fatalf("unexpected filtered node list: %v", got) + } + + if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") { + t.Fatalf("expected trimmed node membership match") + } + if cluster.TestHookContainsNode([]string{"titan-db"}, " ") { + t.Fatalf("expected blank node probe to be ignored") + } + + cfg := lifecycleConfig(t) + orch, _ := newHookOrchestrator(t, cfg, nil, nil) + if !orch.TestHookStartupNodeStrictlyRequired("titan-23") { + t.Fatalf("expected all nodes to be strict when no recovery scopes are configured") + } + + cfgScoped := lifecycleConfig(t) + cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"} + cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"} + cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "} + cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "} + orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil) + if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") { + t.Fatalf("expected control plane to remain strict") + } + if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") { + t.Fatalf("expected inventory-scoped node to remain strict") + } + if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") { + t.Fatalf("expected ssh-scoped node to remain strict") + } + if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") { + t.Fatalf("expected non-core worker to stop being strict") + } + + flux := orchScoped.TestHookStartupRequiredFluxKustomizations() + if _, ok := flux["flux-system/core"]; !ok { + t.Fatalf("expected core flux kustomization in required set: %v", flux) + } + if _, ok := flux["flux-system/gitea"]; !ok { + t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux) + } + + namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces() + if _, ok := namespaces["vault"]; !ok { + t.Fatalf("expected vault namespace in required set: %v", namespaces) + } + if _, ok := namespaces["monitoring"]; !ok { + t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces) + } + }) + + t.Run("startup-vault-helpers", func(t *testing.T) { + t.Run("early-vault-unseal-paths", func(t *testing.T) { + cfgAPI := lifecycleConfig(t) + runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") { + return "", errors.New("api down") + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI) + orchAPI.TestHookBeginStartupReport("startup-vault") + orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background()) + if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") { + t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload) + } + + cfgErr := lifecycleConfig(t) + runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"): + return "v1.31.0", nil + case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"): + return "", errors.New("phase probe failed") + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + } + orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr) + orchErr.TestHookBeginStartupReport("startup-vault") + orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background()) + if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") { + t.Fatalf("expected early vault auto-heal detail, payload=%s", payload) + } + + cfgDeferred := lifecycleConfig(t) + runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"): + return "v1.31.0", nil + case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"): + return "Pending", nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + } + orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred) + orchDeferred.TestHookBeginStartupReport("startup-vault") + orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background()) + if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) { + t.Fatalf("expected deferred early vault detail, payload=%s", payload) + } + + cfgSuccess := lifecycleConfig(t) + runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"): + return "v1.31.0", nil + case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"): + return "Running", nil + case name == "kubectl" && strings.Contains(command, "vault status -format=json"): + return `{"sealed":false,"initialized":true}`, nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + } + orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess) + orchSuccess.TestHookBeginStartupReport("startup-vault") + orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background()) + if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) { + t.Fatalf("expected successful early vault check, payload=%s", payload) + } + }) + + t.Run("startup-vault-gate-paths", func(t *testing.T) { + cfgErr := lifecycleConfig(t) + runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") { + return "", errors.New("phase probe failed") + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr) + orchErr.TestHookBeginStartupReport("startup-vault") + if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") { + t.Fatalf("expected startup vault gate error, got %v", err) + } + + cfgDeferred := lifecycleConfig(t) + runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") { + return "Pending", nil + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred) + orchDeferred.TestHookBeginStartupReport("startup-vault") + if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil { + t.Fatalf("expected deferred startup vault gate to succeed, got %v", err) + } + if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) { + t.Fatalf("expected deferred startup vault detail, payload=%s", payload) + } + + cfgSuccess := lifecycleConfig(t) + runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"): + return "Running", nil + case name == "kubectl" && strings.Contains(command, "vault status -format=json"): + return `{"sealed":false,"initialized":true}`, nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + } + orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess) + orchSuccess.TestHookBeginStartupReport("startup-vault") + if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil { + t.Fatalf("expected successful startup vault gate, got %v", err) + } + if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) { + t.Fatalf("expected successful startup vault detail, payload=%s", payload) + } + }) + }) +} diff --git a/testing/orchestrator/hooks_workload_storage_access_matrix_test.go b/testing/orchestrator/hooks_workload_storage_access_matrix_test.go index bad5e3f..2871191 100644 --- a/testing/orchestrator/hooks_workload_storage_access_matrix_test.go +++ b/testing/orchestrator/hooks_workload_storage_access_matrix_test.go @@ -165,6 +165,32 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) { t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err) } }) + + t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.RequireNodeSSHAuth = true + cfg.Startup.NodeSSHAuthWaitSeconds = 1 + cfg.Startup.NodeSSHAuthPollSeconds = 1 + cfg.Startup.NodeInventoryReachWaitSeconds = 1 + cfg.Startup.NodeInventoryReachPollSeconds = 1 + cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"} + cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"} + + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") { + return "", errors.New("no route to host") + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + orch, _ := newHookOrchestrator(t, cfg, run, run) + if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil { + t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err) + } + if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil { + t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err) + } + }) }) t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {