From 95fefba244ab4caa9f7fa53c038cfa655e7aa358 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 8 Apr 2026 23:42:09 -0300
Subject: [PATCH] startup: enforce external service behavior checks

---
 internal/cluster/orchestrator_core.go         | 124 ++++++
 .../cluster/orchestrator_service_stability.go | 389 ++++++++++++++++++
 internal/cluster/orchestrator_test.go         | 180 +++++---
 internal/config/apply_defaults.go             | 236 +++++++++++
 internal/config/config_test.go                | 127 ++++++
 internal/config/defaults.go                   | 155 +++++++
 internal/config/startup_service_catalog.go    | 315 ++++++++++++++
 internal/config/types.go                      | 156 +++++++
 8 files changed, 1615 insertions(+), 67 deletions(-)
 create mode 100644 internal/cluster/orchestrator_core.go
 create mode 100644 internal/cluster/orchestrator_service_stability.go
 create mode 100644 internal/config/apply_defaults.go
 create mode 100644 internal/config/defaults.go
 create mode 100644 internal/config/startup_service_catalog.go
 create mode 100644 internal/config/types.go

diff --git a/internal/cluster/orchestrator_core.go b/internal/cluster/orchestrator_core.go
new file mode 100644
index 0000000..667397f
--- /dev/null
+++ b/internal/cluster/orchestrator_core.go
@@ -0,0 +1,124 @@
+package cluster
+
+import (
+	"context"
+	"errors"
+	"log"
+	"regexp"
+	"sync"
+	"time"
+
+	"scm.bstein.dev/bstein/ananke/internal/config"
+	"scm.bstein.dev/bstein/ananke/internal/execx"
+	"scm.bstein.dev/bstein/ananke/internal/state"
+)
+
+type Orchestrator struct {
+	cfg                  config.Config
+	runner               *execx.Runner
+	store                *state.Store
+	log                  *log.Logger
+	runOverride          func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
+	runSensitiveOverride func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
+	startupReportMu      sync.Mutex
+	activeStartupReport  *startupReport
+}
+
+type commandOverrideFunc func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error) // same shape as the Orchestrator runOverride/runSensitiveOverride fields; parameter type of SetCommandOverrides.
+
+type StartupOptions struct {
+	ForceFluxBranch    string
+	SkipLocalBootstrap bool
+	Reason             string
+}
+
+type ShutdownOptions struct {
+	SkipEtcdSnapshot bool
+	SkipDrain        bool
+	Mode             string
+	Reason           string
+}
+
+type EtcdRestoreOptions struct {
+	ControlPlane string
+	SnapshotPath string
+}
+
+type startupWorkload struct {
+	Namespace string
+	Kind      string
+	Name      string
+}
+
+type workloadScaleEntry struct {
+	Namespace string `json:"namespace"`
+	Kind      string `json:"kind"`
+	Name      string `json:"name"`
+	Replicas  int    `json:"replicas"`
+}
+
+type remotePeerStatus struct {
+	Intent          state.Intent
+	BootstrapActive bool
+}
+
+type workloadScaleSnapshot struct {
+	GeneratedAt time.Time            `json:"generated_at"`
+	Entries     []workloadScaleEntry `json:"entries"`
+}
+
+type startupReport struct {
+	StartedAt   time.Time                     `json:"started_at"`
+	Completed   time.Time                     `json:"completed_at"`
+	Reason      string                        `json:"reason"`
+	Status      string                        `json:"status"`
+	Phase       string                        `json:"phase"`
+	Success     bool                          `json:"success"`
+	Error       string                        `json:"error,omitempty"`
+	Checks      map[string]startupCheckRecord `json:"checks"`
+	AutoHeals   []string                      `json:"auto_heals"`
+	SourceHost  string                        `json:"source_host"`
+	LastUpdated time.Time                     `json:"last_updated"`
+}
+
+type startupCheckRecord struct {
+	Status    string    `json:"status"`
+	Detail    string    `json:"detail"`
+	UpdatedAt time.Time `json:"updated_at"`
+}
+
+var datastoreEndpointPattern = regexp.MustCompile(`--datastore-endpoint(?:=|\s+)(?:'([^']+)'|"([^"]+)"|([^\s\\]+))`) // value follows "=" or whitespace: group 1 is single-quoted, group 2 double-quoted, group 3 a bare run without whitespace or backslash.
+
+var criticalStartupWorkloads = []startupWorkload{
+	{Namespace: "flux-system", Kind: "deployment", Name: "source-controller"},
+	{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
+	{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
+	{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},
+	{Namespace: "vault", Kind: "statefulset", Name: "vault"},
+	{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
+	{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
+	{Namespace: "monitoring", Kind: "deployment", Name: "grafana"},
+	{Namespace: "monitoring", Kind: "statefulset", Name: "victoria-metrics-single-server"},
+	{Namespace: "monitoring", Kind: "deployment", Name: "kube-state-metrics"},
+	{Namespace: "logging", Kind: "deployment", Name: "oauth2-proxy-logs"},
+	{Namespace: "logging", Kind: "deployment", Name: "opensearch-dashboards"},
+	{Namespace: "logging", Kind: "statefulset", Name: "opensearch"},
+}
+
+var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
+
+// New returns an Orchestrator that holds cfg, runner, store and logger; every other field keeps its zero value.
+// Signature: New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator.
+// Notes: no argument is validated here, and command overrides stay unset until SetCommandOverrides is called.
+func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
+	return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
+}
+
+// SetCommandOverrides stores run and runSensitive in the runOverride and runSensitiveOverride fields (nil is stored as-is).
+// Signature: (o *Orchestrator) SetCommandOverrides(run commandOverrideFunc, runSensitive commandOverrideFunc).
+// Why: enables deterministic integration testing from the top-level testing module
+// without requiring package-local test files or live cluster dependencies.
+func (o *Orchestrator) SetCommandOverrides(run commandOverrideFunc, runSensitive commandOverrideFunc) {
+	o.runOverride = run
+	o.runSensitiveOverride = runSensitive
+}
diff --git a/internal/cluster/orchestrator_service_stability.go b/internal/cluster/orchestrator_service_stability.go
new file mode 100644
index 0000000..3f8a9c1
--- /dev/null
+++ b/internal/cluster/orchestrator_service_stability.go
@@ -0,0 +1,389 @@
+package cluster
+
+import (
+	"context"
+	"crypto/tls"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+	"unicode"
+
+	"scm.bstein.dev/bstein/ananke/internal/config"
+)
+
+// isLikelyHostname reports whether value, once trimmed, is non-empty, has no space character or slash, and contains a dot.
+// Signature: isLikelyHostname(value string) bool.
+// Notes: purely textual heuristic; it does no DNS lookup or hostname-syntax validation, so dotted non-hosts such as "1.2" also pass.
+func isLikelyHostname(value string) bool {
+	value = strings.TrimSpace(value)
+	if value == "" {
+		return false
+	}
+	if strings.Contains(value, " ") || strings.Contains(value, "/") {
+		return false
+	}
+	return strings.Contains(value, ".")
+}
+
+// healIngressHostBackendReplicas scales to 1 every deployment/statefulset whose spec.replicas is below 1 in the namespaces that discoverIngressNamespacesForHost returns for host.
+// Signature: (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error).
+// Notes: returns the healed "namespace/kind/name" entries (also alongside a scale error); not-found workloads are skipped and an unset replica count is treated as 1.
+func (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error) {
+	namespaces, err := o.discoverIngressNamespacesForHost(ctx, host)
+	if err != nil {
+		return nil, err
+	}
+	if len(namespaces) == 0 {
+		return nil, nil
+	}
+	targetNamespaces := makeStringSet(namespaces)
+	out, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
+	if err != nil {
+		return nil, fmt.Errorf("query workloads: %w", err)
+	}
+	var list workloadList
+	if err := json.Unmarshal([]byte(out), &list); err != nil {
+		return nil, fmt.Errorf("decode workloads: %w", err)
+	}
+	healed := []string{}
+	for _, item := range list.Items {
+		kind := strings.ToLower(strings.TrimSpace(item.Kind))
+		ns := strings.TrimSpace(item.Metadata.Namespace)
+		name := strings.TrimSpace(item.Metadata.Name)
+		if kind == "" || ns == "" || name == "" {
+			continue
+		}
+		if kind != "deployment" && kind != "statefulset" {
+			continue
+		}
+		if _, ok := targetNamespaces[ns]; !ok {
+			continue
+		}
+		desired := int32(1)
+		if item.Spec.Replicas != nil {
+			desired = *item.Spec.Replicas
+		}
+		if desired >= 1 {
+			continue
+		}
+		workload := startupWorkload{Namespace: ns, Kind: kind, Name: name}
+		if err := o.ensureWorkloadReplicas(ctx, workload, 1); err != nil {
+			if isNotFoundErr(err) {
+				continue
+			}
+			return healed, fmt.Errorf("scale %s/%s/%s to 1: %w", ns, kind, name, err)
+		}
+		healed = append(healed, ns+"/"+kind+"/"+name)
+	}
+	return healed, nil
+}
+
+// waitForServiceChecklist polls serviceChecklistReady until it passes, ctx is cancelled, or the wait budget expires, invoking the auto-recycle and auto-heal helpers on every pass.
+// Signature: (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error.
+// Notes: wait defaults to 7m and poll to 5s when the configured seconds are <= 0; progress is logged when the failure text changes or every 30s; the timeout error embeds the last failure detail.
+func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error {
+	wait := time.Duration(o.cfg.Startup.ServiceChecklistWaitSeconds) * time.Second
+	if wait <= 0 {
+		wait = 7 * time.Minute
+	}
+	poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second
+	if poll <= 0 {
+		poll = 5 * time.Second
+	}
+	deadline := time.Now().Add(wait)
+	lastFailure := "unknown"
+	lastLogged := time.Time{}
+	lastRecycleAttempt := time.Time{}
+	lastReplicaHeal := time.Time{}
+	lastIngressHeal := time.Time{}
+	for {
+		o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
+		o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
+		prevFailure := lastFailure
+		ready, detail := o.serviceChecklistReady(ctx)
+		lastFailure = detail
+		if ready {
+			o.log.Printf("external service checklist passed (%s)", detail)
+			return nil
+		}
+		o.maybeAutoHealIngressHostBackends(ctx, &lastIngressHeal, lastFailure)
+		if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second {
+			remaining := time.Until(deadline).Round(time.Second)
+			if remaining < 0 {
+				remaining = 0
+			}
+			o.log.Printf("waiting for external service checklist (%s remaining): %s", remaining, lastFailure)
+			lastLogged = time.Now()
+		}
+		if time.Now().After(deadline) {
+			return fmt.Errorf("startup blocked: external service checklist not satisfied within %s (%s)", wait, lastFailure)
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(poll):
+		}
+	}
+}
+
+// serviceChecklistReady evaluates cfg.Startup.ServiceChecklist in order with serviceCheckReady and stops at the first failing check.
+// Signature: (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string).
+// Notes: returns (true, "no checklist items configured") for an empty list, (true, "checks=N") on success, otherwise (false, "<name, or URL when the name is blank>: <detail>").
+func (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string) {
+	checks := o.cfg.Startup.ServiceChecklist
+	if len(checks) == 0 {
+		return true, "no checklist items configured"
+	}
+	for _, check := range checks {
+		ok, detail := o.serviceCheckReady(ctx, check)
+		if !ok {
+			name := strings.TrimSpace(check.Name)
+			if name == "" {
+				name = strings.TrimSpace(check.URL)
+			}
+			return false, fmt.Sprintf("%s: %s", name, detail)
+		}
+	}
+	return true, fmt.Sprintf("checks=%d", len(checks))
+}
+
+// serviceCheckReady probes one checklist entry and validates, in order, the status code, the Location-header markers and the body markers; the string describes the first failure or "status=N".
+// Signature: (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string).
+// Notes: an empty AcceptedStatuses falls back to 200-204, 301-303, 307, 308, 401 and 403; markers are trimmed, blank markers are skipped, and matching goes through checklistContains.
+func (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string) {
+	result, err := o.httpChecklistProbeResult(ctx, check)
+	if err != nil {
+		return false, err.Error()
+	}
+
+	accepted := check.AcceptedStatuses
+	if len(accepted) == 0 {
+		accepted = []int{200, 201, 202, 203, 204, 301, 302, 303, 307, 308, 401, 403}
+	}
+	statusOk := false
+	for _, code := range accepted {
+		if result.Status == code {
+			statusOk = true
+			break
+		}
+	}
+	if !statusOk {
+		return false, fmt.Sprintf("unexpected status code=%d", result.Status)
+	}
+
+	locationContains := strings.TrimSpace(check.LocationContains)
+	if locationContains != "" && !checklistContains(result.Location, locationContains) {
+		return false, fmt.Sprintf("location header missing expected marker %q", locationContains)
+	}
+
+	locationNotContains := strings.TrimSpace(check.LocationNotContains)
+	if locationNotContains != "" && checklistContains(result.Location, locationNotContains) {
+		return false, fmt.Sprintf("location header contained forbidden marker %q", locationNotContains)
+	}
+
+	bodyContains := strings.TrimSpace(check.BodyContains)
+	if bodyContains != "" && !checklistContains(result.Body, bodyContains) {
+		return false, fmt.Sprintf("response missing expected marker %q", bodyContains)
+	}
+
+	bodyNotContains := strings.TrimSpace(check.BodyNotContains)
+	if bodyNotContains != "" && checklistContains(result.Body, bodyNotContains) {
+		return false, fmt.Sprintf("response contained forbidden marker %q", bodyNotContains)
+	}
+
+	return true, fmt.Sprintf("status=%d", result.Status)
+}
+
+type checklistHTTPProbeResult struct {
+	Status   int
+	Body     string
+	Location string
+}
+
+// httpChecklistProbeResult calls httpChecklistProbeWithLocation and packs status, body and Location into a checklistHTTPProbeResult (zero value on error).
+// Signature: (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error).
+// Why: checklist checks need response headers (for redirect verification) in
+// addition to status/body so startup can validate real user-facing behavior.
+func (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error) {
+	result := checklistHTTPProbeResult{}
+	status, body, location, err := o.httpChecklistProbeWithLocation(ctx, check)
+	if err != nil {
+		return result, err
+	}
+	result.Status = status
+	result.Body = body
+	result.Location = location
+	return result, nil
+}
+
+// httpChecklistProbe calls httpChecklistProbeWithLocation and returns its status, body and error, discarding the Location header.
+// Signature: (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error).
+// NOTE(review): no caller is visible in this file; presumably kept for call sites that do not need the Location header - confirm before removing.
+func (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error) {
+	status, body, _, err := o.httpChecklistProbeWithLocation(ctx, check)
+	return status, body, err
+}
+
+// httpChecklistProbeWithLocation issues one GET to check.URL without following redirects and returns the status code, up to 64 KiB of body, and the trimmed Location header.
+// Signature: (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error).
+// Why: redirects and auth gates require location-header assertions to prevent startup false-positives on partially healthy protected services;
+// keep-alives are disabled because every probe builds its own transport, whose idle connections would otherwise be neither reused nor closed. Timeout defaults to 12s.
+func (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error) {
+	timeout := time.Duration(check.TimeoutSeconds) * time.Second
+	if timeout <= 0 {
+		timeout = 12 * time.Second
+	}
+
+	transport := &http.Transport{DisableKeepAlives: true}
+	if check.InsecureSkipTLS {
+		transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
+	}
+	client := &http.Client{
+		Timeout:   timeout,
+		Transport: transport,
+		CheckRedirect: func(_ *http.Request, _ []*http.Request) error {
+			return http.ErrUseLastResponse
+		},
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(check.URL), nil)
+	if err != nil {
+		return 0, "", "", fmt.Errorf("build request: %w", err)
+	}
+	req.Header.Set("User-Agent", "ananke/startup-checklist")
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return 0, "", "", fmt.Errorf("request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
+	if readErr != nil {
+		return resp.StatusCode, "", "", fmt.Errorf("read response body: %w", readErr)
+	}
+
+	return resp.StatusCode, string(body), strings.TrimSpace(resp.Header.Get("Location")), nil
+}
+
+// checklistContains reports whether body contains marker, first case-insensitively and then again with all Unicode whitespace removed from both sides.
+// Signature: checklistContains(body, marker string) bool.
+// Notes: a marker that is empty or whitespace-only matches any body; serviceCheckReady also passes the Location header as body.
+func checklistContains(body, marker string) bool {
+	bodyLower := strings.ToLower(body)
+	markerLower := strings.ToLower(marker)
+	if strings.Contains(bodyLower, markerLower) {
+		return true
+	}
+	bodyCompact := compactLowerNoSpace(bodyLower)
+	markerCompact := compactLowerNoSpace(markerLower)
+	if markerCompact == "" {
+		return true
+	}
+	return strings.Contains(bodyCompact, markerCompact)
+}
+
+// compactLowerNoSpace returns s with every rune for which unicode.IsSpace is true removed.
+// Signature: compactLowerNoSpace(s string) string.
+// Notes: despite the name it does not change case; checklistContains lowercases its inputs before calling it.
+func compactLowerNoSpace(s string) string {
+	var b strings.Builder
+	b.Grow(len(s))
+	for _, r := range s {
+		if unicode.IsSpace(r) {
+			continue
+		}
+		b.WriteRune(r)
+	}
+	return b.String()
+}
+
+// waitForStabilityWindow re-runs startupStabilityHealthy every poll interval for cfg.Startup.ServiceChecklistStabilitySec seconds and fails on the first unhealthy result.
+// Signature: (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error.
+// Notes: a window <= 0 skips the soak entirely; poll defaults to 5s; progress is logged every 30s; the auto-recycle and replica-heal helpers run on each pass; ctx cancellation returns ctx.Err().
+func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
+	window := time.Duration(o.cfg.Startup.ServiceChecklistStabilitySec) * time.Second
+	if window <= 0 {
+		return nil
+	}
+	poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second
+	if poll <= 0 {
+		poll = 5 * time.Second
+	}
+	deadline := time.Now().Add(window)
+	lastStatus := time.Time{}
+	lastRecycleAttempt := time.Time{}
+	lastReplicaHeal := time.Time{}
+
+	for {
+		o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
+		o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
+		if err := o.startupStabilityHealthy(ctx); err != nil {
+			return fmt.Errorf("startup stability window failed: %w", err)
+		}
+		if time.Now().After(deadline) {
+			o.log.Printf("startup stability window passed (%s)", window)
+			return nil
+		}
+		if time.Since(lastStatus) >= 30*time.Second {
+			remaining := time.Until(deadline).Round(time.Second)
+			if remaining < 0 {
+				remaining = 0
+			}
+			o.log.Printf("startup stability soak in progress (%s remaining)", remaining)
+			lastStatus = time.Now()
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(poll):
+		}
+	}
+}
+
+// startupStabilityHealthy runs the flux, workload-convergence, service-checklist and ingress-checklist checks that cfg.Startup enables, then always checks startupFailurePods.
+// Signature: (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error.
+// Notes: returns the first failing check as an error; nil means every enabled check passed and startupFailurePods reported no pods.
+func (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error {
+	if o.cfg.Startup.RequireFluxHealth {
+		ready, detail, err := o.fluxHealthReady(ctx)
+		if err != nil {
+			return fmt.Errorf("flux check error: %w", err)
+		}
+		if !ready {
+			return fmt.Errorf("flux not ready: %s", detail)
+		}
+	}
+	if o.cfg.Startup.RequireWorkloadConvergence {
+		ready, detail, err := o.workloadConvergenceReady(ctx)
+		if err != nil {
+			return fmt.Errorf("workload check error: %w", err)
+		}
+		if !ready {
+			return fmt.Errorf("workloads not converged: %s", detail)
+		}
+	}
+	if o.cfg.Startup.RequireServiceChecklist {
+		ready, detail := o.serviceChecklistReady(ctx)
+		if !ready {
+			return fmt.Errorf("external services not healthy: %s", detail)
+		}
+	}
+	if o.cfg.Startup.RequireIngressChecklist {
+		ready, detail := o.ingressChecklistReady(ctx)
+		if !ready {
+			return fmt.Errorf("ingress reachability not healthy: %s", detail)
+		}
+	}
+	failures, err := o.startupFailurePods(ctx)
+	if err != nil {
+		return fmt.Errorf("pod failure check error: %w", err)
+	}
+	if len(failures) > 0 {
+		return fmt.Errorf("pods in crash/image-pull failures: %s", joinLimited(failures, 8))
+	}
+	return nil
+}
diff --git a/internal/cluster/orchestrator_test.go b/internal/cluster/orchestrator_test.go
index c9254ca..30ef6c5 100644
--- a/internal/cluster/orchestrator_test.go
+++ b/internal/cluster/orchestrator_test.go
@@ -15,6 +15,9 @@ import (
 	"scm.bstein.dev/bstein/ananke/internal/state"
 )
 
+// TestParseVaultSealed runs one orchestration or CLI step.
+// Signature: TestParseVaultSealed(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseVaultSealed(t *testing.T) {
 	sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`)
 	if err != nil {
@@ -33,12 +36,18 @@ func TestParseVaultSealed(t *testing.T) {
 	}
 }
 
+// TestParseVaultSealedRejectsEmpty checks that parseVaultSealed returns an error for a whitespace-only payload.
+// Signature: TestParseVaultSealedRejectsEmpty(t *testing.T).
+// Why: guards the empty-input path of parseVaultSealed.
 func TestParseVaultSealedRejectsEmpty(t *testing.T) {
 	if _, err := parseVaultSealed("   "); err == nil {
 		t.Fatalf("expected parse error for empty status payload")
 	}
 }
 
+// TestParseVaultSealedWithKubectlPreamble runs one orchestration or CLI step.
+// Signature: TestParseVaultSealedWithKubectlPreamble(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
 	raw := "Defaulted container \"vault\" out of: vault, setup-config (init)\n{\"sealed\":true,\"initialized\":true}\n"
 	sealed, err := parseVaultSealed(raw)
@@ -50,6 +59,9 @@ func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
 	}
 }
 
+// TestFallbackWorkersFromInventoryUsesManagedNodes runs one orchestration or CLI step.
+// Signature: TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@@ -70,6 +82,9 @@ func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
 	}
 }
 
+// TestFallbackWorkersFromInventoryFallsBackToHosts runs one orchestration or CLI step.
+// Signature: TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@@ -89,12 +104,18 @@ func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
 	}
 }
 
+// TestIntentFreshTreatsZeroTimestampAsFresh checks that intentFresh returns true for a zero-value state.Intent with a 30s limit.
+// Signature: TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T).
+// Why: pins the rule that an intent with no UpdatedAt timestamp counts as fresh.
 func TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T) {
 	if !intentFresh(state.Intent{}, 30*time.Second) {
 		t.Fatalf("zero updated_at intent should be treated as fresh")
 	}
 }
 
+// TestIntentFreshRespectsAge runs one orchestration or CLI step.
+// Signature: TestIntentFreshRespectsAge(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestIntentFreshRespectsAge(t *testing.T) {
 	stale := state.Intent{UpdatedAt: time.Now().Add(-2 * time.Minute)}
 	fresh := state.Intent{UpdatedAt: time.Now().Add(-20 * time.Second)}
@@ -106,6 +127,9 @@ func TestIntentFreshRespectsAge(t *testing.T) {
 	}
 }
 
+// TestCoordinationPeersDedupesAndIncludesForwardHost runs one orchestration or CLI step.
+// Signature: TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@@ -122,6 +146,9 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
 	}
 }
 
+// TestWorkloadTargetsIgnoredNodesByNodeSelector runs one orchestration or CLI step.
+// Signature: TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
 	spec := podSpec{
 		NodeSelector: map[string]string{
@@ -134,6 +161,9 @@ func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
 	}
 }
 
+// TestParseWorkloadIgnoreRules runs one orchestration or CLI step.
+// Signature: TestParseWorkloadIgnoreRules(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseWorkloadIgnoreRules(t *testing.T) {
 	rules := parseWorkloadIgnoreRules([]string{
 		"maintenance/metis",
@@ -153,6 +183,9 @@ func TestParseWorkloadIgnoreRules(t *testing.T) {
 	}
 }
 
+// TestNamespaceCandidatesFromIgnoreKustomizations runs one orchestration or CLI step.
+// Signature: TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
 	got := namespaceCandidatesFromIgnoreKustomizations([]string{
 		"flux-system/jellyfin",
@@ -166,12 +199,18 @@ func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
 	}
 }
 
+// TestProbeStatusAcceptedRejects404 checks that probeStatusAccepted returns false for status 404 on a login URL.
+// Signature: TestProbeStatusAcceptedRejects404(t *testing.T).
+// Why: a 404 from a probed endpoint must not be counted as an accepted status.
 func TestProbeStatusAcceptedRejects404(t *testing.T) {
 	if probeStatusAccepted("https://metrics.bstein.dev/login", 404) {
 		t.Fatalf("expected 404 probe status to be rejected")
 	}
 }
 
+// TestParseFluxKustomizationTimeout runs one orchestration or CLI step.
+// Signature: TestParseFluxKustomizationTimeout(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseFluxKustomizationTimeout(t *testing.T) {
 	if got := parseFluxKustomizationTimeout("30m"); got != 30*time.Minute {
 		t.Fatalf("expected 30m duration, got %s", got)
@@ -187,6 +226,9 @@ func TestParseFluxKustomizationTimeout(t *testing.T) {
 	}
 }
 
+// TestServiceCheckReadyRequiresBodyContains runs one orchestration or CLI step.
+// Signature: TestServiceCheckReadyRequiresBodyContains(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
@@ -209,6 +251,9 @@ func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
 	}
 }
 
+// TestServiceCheckReadyBodyContainsIgnoresWhitespace runs one orchestration or CLI step.
+// Signature: TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
@@ -231,6 +276,62 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
 	}
 }
 
+// TestServiceCheckReadyRequiresLocationContains checks that serviceCheckReady passes for a 302 whose Location header contains the configured marker.
+// Signature: TestServiceCheckReadyRequiresLocationContains(t *testing.T).
+// Why: startup checks must validate redirect targets for OIDC-gated services.
+func TestServiceCheckReadyRequiresLocationContains(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=logs")
+		w.WriteHeader(http.StatusFound)
+	}))
+	defer srv.Close()
+
+	orch := &Orchestrator{
+		log: log.New(os.Stdout, "", 0),
+	}
+	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
+		Name:             "logging-oidc-redirect",
+		URL:              srv.URL,
+		AcceptedStatuses: []int{302},
+		LocationContains: "client_id=logs",
+		TimeoutSeconds:   5,
+	})
+	if !ok {
+		t.Fatalf("expected location-aware service check to pass, detail=%s", detail)
+	}
+}
+
+// TestServiceCheckReadyRejectsMissingLocationMarker checks that serviceCheckReady fails with a missing-marker detail when the 302 Location lacks the configured marker.
+// Signature: TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T).
+// Why: prevents false positives when redirects point somewhere unexpected.
+func TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=wrong")
+		w.WriteHeader(http.StatusFound)
+	}))
+	defer srv.Close()
+
+	orch := &Orchestrator{
+		log: log.New(os.Stdout, "", 0),
+	}
+	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
+		Name:             "logging-oidc-redirect",
+		URL:              srv.URL,
+		AcceptedStatuses: []int{302},
+		LocationContains: "client_id=logs",
+		TimeoutSeconds:   5,
+	})
+	if ok {
+		t.Fatalf("expected location-aware service check to fail")
+	}
+	if !strings.Contains(detail, "location header missing expected marker") {
+		t.Fatalf("expected missing location marker detail, got %q", detail)
+	}
+}
+
+// TestChecklistFailureHostFromIngressDetail runs one orchestration or CLI step.
+// Signature: TestChecklistFailureHostFromIngressDetail(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestChecklistFailureHostFromIngressDetail(t *testing.T) {
 	orch := &Orchestrator{}
 	got := orch.checklistFailureHost("cloud.bstein.dev: unexpected status code=500")
@@ -239,6 +340,9 @@ func TestChecklistFailureHostFromIngressDetail(t *testing.T) {
 	}
 }
 
+// TestChecklistFailureHostFromServiceCheckName runs one orchestration or CLI step.
+// Signature: TestChecklistFailureHostFromServiceCheckName(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestChecklistFailureHostFromServiceCheckName(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@@ -258,6 +362,9 @@ func TestChecklistFailureHostFromServiceCheckName(t *testing.T) {
 	}
 }
 
+// TestChecklistFailureHostUnknown runs one orchestration or CLI step.
+// Signature: TestChecklistFailureHostUnknown(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestChecklistFailureHostUnknown(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@@ -279,6 +386,9 @@ func TestChecklistFailureHostUnknown(t *testing.T) {
 	}
 }
 
+// TestStuckVaultInitReasonDetectsHungInit runs one orchestration or CLI step.
+// Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
 	var pod podResource
 	pod.Status.Phase = "Pending"
@@ -302,6 +412,9 @@ func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
 	}
 }
 
+// TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods runs one orchestration or CLI step.
+// Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
 	var pod podResource
 	pod.Status.Phase = "Pending"
@@ -328,70 +441,3 @@ func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
 		t.Fatalf("expected no reason for non-vault pod, got %q", reason)
 	}
 }
-
-func TestValidateNodeInventoryPassesForStrictMappings(t *testing.T) {
-	orch := &Orchestrator{
-		cfg: config.Config{
-			SSHUser: "atlas",
-			SSHPort: 2277,
-			SSHNodeHosts: map[string]string{
-				"titan-0a": "192.168.22.11",
-				"titan-0b": "192.168.22.12",
-				"titan-0c": "192.168.22.13",
-				"titan-22": "192.168.22.22",
-			},
-			SSHManagedNodes: []string{"titan-0a", "titan-0b", "titan-0c", "titan-22"},
-			ControlPlanes:   []string{"titan-0a", "titan-0b", "titan-0c"},
-			Workers:         []string{"titan-22"},
-		},
-		log: log.New(os.Stdout, "", 0),
-	}
-	if err := orch.validateNodeInventory(); err != nil {
-		t.Fatalf("expected inventory to pass, got error: %v", err)
-	}
-}
-
-func TestValidateNodeInventoryFailsWhenNodeMappingMissing(t *testing.T) {
-	orch := &Orchestrator{
-		cfg: config.Config{
-			SSHUser:         "atlas",
-			SSHPort:         2277,
-			SSHNodeHosts:    map[string]string{"titan-0a": "192.168.22.11"},
-			SSHManagedNodes: []string{"titan-0a", "titan-0b"},
-			ControlPlanes:   []string{"titan-0a"},
-			Workers:         []string{"titan-0b"},
-		},
-		log: log.New(os.Stdout, "", 0),
-	}
-	err := orch.validateNodeInventory()
-	if err == nil {
-		t.Fatalf("expected inventory error for missing mapping")
-	}
-	if !strings.Contains(err.Error(), "missing ssh_node_hosts entry") {
-		t.Fatalf("expected missing-mapping detail, got: %v", err)
-	}
-}
-
-func TestValidateNodeInventoryFailsWhenWorkerNotManaged(t *testing.T) {
-	orch := &Orchestrator{
-		cfg: config.Config{
-			SSHUser: "atlas",
-			SSHPort: 2277,
-			SSHNodeHosts: map[string]string{
-				"titan-0a": "192.168.22.11",
-				"titan-22": "192.168.22.22",
-			},
-			SSHManagedNodes: []string{"titan-0a"},
-			ControlPlanes:   []string{"titan-0a"},
-			Workers:         []string{"titan-22"},
-		},
-		log: log.New(os.Stdout, "", 0),
-	}
-	err := orch.validateNodeInventory()
-	if err == nil {
-		t.Fatalf("expected inventory error for unmanaged worker")
-	}
-	if !strings.Contains(err.Error(), "missing from ssh_managed_nodes") {
-		t.Fatalf("expected unmanaged-worker detail, got: %v", err)
-	}
-}
diff --git a/internal/config/apply_defaults.go b/internal/config/apply_defaults.go
new file mode 100644
index 0000000..0a6a75b
--- /dev/null
+++ b/internal/config/apply_defaults.go
@@ -0,0 +1,236 @@
+package config
+
+import "strings"
+
+// applyDefaults fills empty, nil, or non-positive Config fields with built-in default values.
+// Signature: (c *Config) applyDefaults().
+// Why: partial host configs still end up with usable timings, paths, and the merged baseline service/endpoint checks.
+func (c *Config) applyDefaults() {
+	if c.ExpectedFluxBranch == "" {
+		c.ExpectedFluxBranch = "main"
+	}
+	if c.IACRepoPath == "" {
+		c.IACRepoPath = "/opt/titan-iac"
+	}
+	if c.ExpectedFluxSource == "" {
+		c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
+	}
+	if c.Startup.APIWaitSeconds <= 0 {
+		c.Startup.APIWaitSeconds = 1200
+	}
+	if c.Startup.APIPollSeconds <= 0 {
+		c.Startup.APIPollSeconds = 2
+	}
+	if c.Startup.ShutdownCooldownSeconds <= 0 {
+		c.Startup.ShutdownCooldownSeconds = 45
+	}
+	if c.Startup.MinimumBatteryPercent <= 0 {
+		c.Startup.MinimumBatteryPercent = 20
+	}
+	if c.Startup.NodeInventoryReachWaitSeconds <= 0 {
+		c.Startup.NodeInventoryReachWaitSeconds = 300
+	}
+	if c.Startup.NodeInventoryReachPollSeconds <= 0 {
+		c.Startup.NodeInventoryReachPollSeconds = 5
+	}
+	if c.Startup.RequiredNodeLabels == nil { // only nil is defaulted; an explicit empty map is kept
+		c.Startup.RequiredNodeLabels = map[string]map[string]string{
+			"titan-09": {
+				"ananke.bstein.dev/harbor-bootstrap": "true",
+			},
+		}
+	}
+	if c.Startup.TimeSyncWaitSeconds <= 0 {
+		c.Startup.TimeSyncWaitSeconds = 240
+	}
+	if c.Startup.TimeSyncPollSeconds <= 0 {
+		c.Startup.TimeSyncPollSeconds = 5
+	}
+	if c.Startup.TimeSyncMode == "" {
+		c.Startup.TimeSyncMode = "quorum"
+	}
+	if c.Startup.TimeSyncQuorum <= 0 {
+		c.Startup.TimeSyncQuorum = 2
+	}
+	if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 { // clamp quorum to the control-plane count when any are configured
+		c.Startup.TimeSyncQuorum = len(c.ControlPlanes)
+	}
+	if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 { // fall back to the first configured control plane
+		c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
+	}
+	if c.Startup.StorageReadyWaitSeconds <= 0 {
+		c.Startup.StorageReadyWaitSeconds = 420
+	}
+	if c.Startup.StorageReadyPollSeconds <= 0 {
+		c.Startup.StorageReadyPollSeconds = 5
+	}
+	if c.Startup.StorageMinReadyNodes <= 0 {
+		c.Startup.StorageMinReadyNodes = 2
+	}
+	if len(c.Startup.StorageCriticalPVCs) == 0 {
+		c.Startup.StorageCriticalPVCs = []string{
+			"vault/data-vault-0",
+			"postgres/postgres-data-postgres-0",
+			"gitea/gitea-data",
+			"sso/keycloak-data",
+		}
+	}
+	if c.Startup.PostStartProbeWaitSeconds <= 0 {
+		c.Startup.PostStartProbeWaitSeconds = 240
+	}
+	if c.Startup.PostStartProbePollSeconds <= 0 {
+		c.Startup.PostStartProbePollSeconds = 5
+	}
+	if len(c.Startup.PostStartProbes) == 0 {
+		c.Startup.PostStartProbes = []string{
+			"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
+			"https://scm.bstein.dev/api/healthz",
+			"https://metrics.bstein.dev/api/health",
+		}
+	}
+	if c.Startup.ServiceChecklistWaitSeconds <= 0 {
+		c.Startup.ServiceChecklistWaitSeconds = 420
+	}
+	if c.Startup.ServiceChecklistPollSeconds <= 0 {
+		c.Startup.ServiceChecklistPollSeconds = 5
+	}
+	if c.Startup.ServiceChecklistStabilitySec < 0 { // only negatives are clamped; zero is left as-is
+		c.Startup.ServiceChecklistStabilitySec = 0
+	}
+	c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist()) // keeps configured checks, appends defaults missing by name
+	for i := range c.Startup.ServiceChecklist {
+		if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
+			c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
+		}
+	}
+	if c.Startup.CriticalServiceEndpointWaitSec <= 0 {
+		c.Startup.CriticalServiceEndpointWaitSec = 420
+	}
+	if c.Startup.CriticalServiceEndpointPollSec <= 0 {
+		c.Startup.CriticalServiceEndpointPollSec = 5
+	}
+	c.Startup.CriticalServiceEndpoints = mergeStringDefaults(c.Startup.CriticalServiceEndpoints, defaultCriticalServiceEndpoints()) // configured entries first, then defaults not already listed
+	if c.Startup.IngressChecklistWaitSeconds <= 0 {
+		c.Startup.IngressChecklistWaitSeconds = 420
+	}
+	if c.Startup.IngressChecklistPollSeconds <= 0 {
+		c.Startup.IngressChecklistPollSeconds = 5
+	}
+	if len(c.Startup.IngressChecklistAccepted) == 0 {
+		c.Startup.IngressChecklistAccepted = []int{200, 301, 302, 307, 308, 401, 403, 404}
+	}
+	if c.Startup.IngressChecklistIgnoreHosts == nil {
+		c.Startup.IngressChecklistIgnoreHosts = []string{}
+	}
+	if c.Startup.NodeSSHAuthWaitSeconds <= 0 {
+		c.Startup.NodeSSHAuthWaitSeconds = 240
+	}
+	if c.Startup.NodeSSHAuthPollSeconds <= 0 {
+		c.Startup.NodeSSHAuthPollSeconds = 5
+	}
+	if c.Startup.FluxHealthWaitSeconds <= 0 {
+		c.Startup.FluxHealthWaitSeconds = 900
+	}
+	if c.Startup.FluxHealthPollSeconds <= 0 {
+		c.Startup.FluxHealthPollSeconds = 5
+	}
+	if c.Startup.IgnoreFluxKustomizations == nil {
+		c.Startup.IgnoreFluxKustomizations = []string{}
+	}
+	if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
+		c.Startup.WorkloadConvergenceWaitSeconds = 900
+	}
+	if c.Startup.WorkloadConvergencePollSeconds <= 0 {
+		c.Startup.WorkloadConvergencePollSeconds = 5
+	}
+	if c.Startup.IgnoreWorkloadNamespaces == nil {
+		c.Startup.IgnoreWorkloadNamespaces = []string{}
+	}
+	if c.Startup.IgnoreWorkloads == nil {
+		c.Startup.IgnoreWorkloads = []string{}
+	}
+	if c.Startup.IgnoreUnavailableNodes == nil {
+		c.Startup.IgnoreUnavailableNodes = []string{}
+	}
+	if c.Startup.StuckPodGraceSeconds <= 0 {
+		c.Startup.StuckPodGraceSeconds = 180
+	}
+	if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { // a whitespace-only path counts as unset
+		c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
+	}
+	if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
+		c.Startup.VaultUnsealBreakglassTimeout = 15
+	}
+	if c.SSHPort <= 0 {
+		c.SSHPort = 2277
+	}
+	if c.Shutdown.DefaultBudgetSeconds <= 0 {
+		c.Shutdown.DefaultBudgetSeconds = 1380
+	}
+	if c.Shutdown.HistoryMinSamples <= 0 {
+		c.Shutdown.HistoryMinSamples = 3
+	}
+	if c.Shutdown.EmergencyBudgetSec <= 0 {
+		c.Shutdown.EmergencyBudgetSec = 420
+	}
+	if c.Shutdown.EmergencyMinSamples <= 0 {
+		c.Shutdown.EmergencyMinSamples = 3
+	}
+	if c.Shutdown.DrainParallelism <= 0 {
+		c.Shutdown.DrainParallelism = 6
+	}
+	if c.Shutdown.ScaleParallelism <= 0 {
+		c.Shutdown.ScaleParallelism = 8
+	}
+	if c.Shutdown.SSHParallelism <= 0 {
+		c.Shutdown.SSHParallelism = 8
+	}
+	if c.UPS.PollSeconds <= 0 {
+		c.UPS.PollSeconds = 5
+	}
+	if c.UPS.RuntimeSafetyFactor <= 0 {
+		c.UPS.RuntimeSafetyFactor = 1.25
+	}
+	if c.UPS.DebounceCount <= 0 {
+		c.UPS.DebounceCount = 3
+	}
+	if c.UPS.TelemetryTimeoutSeconds <= 0 {
+		c.UPS.TelemetryTimeoutSeconds = 90
+	}
+	if c.Coordination.ForwardShutdownConfig == "" {
+		c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
+	}
+	if c.Coordination.PeerHosts == nil {
+		c.Coordination.PeerHosts = []string{}
+	}
+	if c.Coordination.CommandTimeoutSeconds <= 0 {
+		c.Coordination.CommandTimeoutSeconds = 25
+	}
+	if c.Coordination.StartupGuardMaxAgeSec <= 0 {
+		c.Coordination.StartupGuardMaxAgeSec = 900
+	}
+	if c.Coordination.Role == "" {
+		c.Coordination.Role = "coordinator"
+	}
+	if c.Metrics.BindAddr == "" {
+		c.Metrics.BindAddr = "0.0.0.0:9560"
+	}
+	if c.Metrics.Path == "" {
+		c.Metrics.Path = "/metrics"
+	}
+	if c.State.Dir == "" {
+		c.State.Dir = "/var/lib/ananke"
+	}
+	if c.State.ReportsDir == "" {
+		c.State.ReportsDir = "/var/lib/ananke/reports"
+	}
+	if c.State.RunHistoryPath == "" {
+		c.State.RunHistoryPath = "/var/lib/ananke/runs.json"
+	}
+	if c.State.LockPath == "" {
+		c.State.LockPath = "/var/lib/ananke/ananke.lock"
+	}
+	if c.State.IntentPath == "" {
+		c.State.IntentPath = "/var/lib/ananke/intent.json"
+	}
+}
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index d4af8fa..07f6f61 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -7,6 +7,9 @@ import (
 	"testing"
 )
 
+// TestLoadAcceptsUPSTargets runs one orchestration or CLI step.
+// Signature: TestLoadAcceptsUPSTargets(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestLoadAcceptsUPSTargets(t *testing.T) {
 	tmp := t.TempDir()
 	cfgPath := filepath.Join(tmp, "ananke.yaml")
@@ -39,6 +42,9 @@ state:
 	}
 }
 
+// TestValidateForwardShutdownRequiresConfigPath runs one orchestration or CLI step.
+// Signature: TestValidateForwardShutdownRequiresConfigPath(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
 	cfg := defaults()
 	cfg.Coordination.ForwardShutdownHost = "titan-db"
@@ -48,6 +54,9 @@ func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsUnknownRole runs one orchestration or CLI step.
+// Signature: TestValidateRejectsUnknownRole(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsUnknownRole(t *testing.T) {
 	cfg := defaults()
 	cfg.Coordination.Role = "unknown"
@@ -56,6 +65,9 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsEmptyPeerHostEntry runs one orchestration or CLI step.
+// Signature: TestValidateRejectsEmptyPeerHostEntry(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
 	cfg := defaults()
 	cfg.Coordination.PeerHosts = []string{"titan-24", " "}
@@ -64,6 +76,9 @@ func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsUnknownEtcdRestoreControlPlane runs one orchestration or CLI step.
+// Signature: TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
@@ -72,6 +87,9 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
 	}
 }
 
+// TestLoadSetsCoordinationGuardDefaults runs one orchestration or CLI step.
+// Signature: TestLoadSetsCoordinationGuardDefaults(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
 	tmp := t.TempDir()
 	cfgPath := filepath.Join(tmp, "ananke.yaml")
@@ -114,6 +132,9 @@ state:
 	}
 }
 
+// TestValidateRejectsInvalidStartupShutdownCooldown runs one orchestration or CLI step.
+// Signature: TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.ShutdownCooldownSeconds = 0
@@ -122,6 +143,9 @@ func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsInvalidTimeSyncMode runs one orchestration or CLI step.
+// Signature: TestValidateRejectsInvalidTimeSyncMode(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.TimeSyncMode = "invalid"
@@ -130,6 +154,9 @@ func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsBadStoragePVCFormat runs one orchestration or CLI step.
+// Signature: TestValidateRejectsBadStoragePVCFormat(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.StorageCriticalPVCs = []string{"vault-data-vault-0"}
@@ -138,6 +165,9 @@ func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsMissingPostStartProbesWhenRequired runs one orchestration or CLI step.
+// Signature: TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.RequirePostStartProbes = true
@@ -147,6 +177,9 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsMissingServiceChecklistWhenRequired runs one orchestration or CLI step.
+// Signature: TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.RequireServiceChecklist = true
@@ -156,6 +189,9 @@ func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsBadServiceChecklistURL runs one orchestration or CLI step.
+// Signature: TestValidateRejectsBadServiceChecklistURL(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
@@ -171,6 +207,9 @@ func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsBadIgnoreFluxKustomizationFormat runs one orchestration or CLI step.
+// Signature: TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"}
@@ -179,6 +218,9 @@ func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsBadIgnoreWorkloadFormat runs one orchestration or CLI step.
+// Signature: TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.IgnoreWorkloads = []string{"maintenance/metis/extra/value"}
@@ -187,6 +229,9 @@ func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
 	}
 }
 
+// TestValidateRejectsInvalidRequiredNodeLabel runs one orchestration or CLI step.
+// Signature: TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T).
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
@@ -198,3 +243,85 @@ func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
 		t.Fatalf("expected validation error for invalid required_node_labels entry")
 	}
 }
+
+// TestValidateRejectsInvalidNodeInventoryReachWindow checks that Validate rejects a zero reach wait window.
+// Signature: TestValidateRejectsInvalidNodeInventoryReachWindow(t *testing.T).
+// Why: a default config with NodeInventoryReachWaitSeconds forced to 0 must produce a validation error.
+func TestValidateRejectsInvalidNodeInventoryReachWindow(t *testing.T) {
+	cfg := defaults()
+	cfg.Startup.NodeInventoryReachWaitSeconds = 0
+	if err := cfg.Validate(); err == nil {
+		t.Fatalf("expected validation error for invalid node_inventory_reachability_wait_seconds")
+	}
+}
+
+// TestValidateRejectsMissingReportsDir checks that Validate rejects an empty State.ReportsDir.
+// Signature: TestValidateRejectsMissingReportsDir(t *testing.T).
+// Why: a default config with the reports directory cleared must produce a validation error.
+func TestValidateRejectsMissingReportsDir(t *testing.T) {
+	cfg := defaults()
+	cfg.State.ReportsDir = ""
+	if err := cfg.Validate(); err == nil {
+		t.Fatalf("expected validation error for missing state.reports_dir")
+	}
+}
+
+// TestApplyDefaultsMergesServiceChecklistDefaults checks applyDefaults keeps a custom check and adds defaults.
+// Signature: TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T).
+// Why: host configs may define a partial checklist; startup still needs the
+// baseline service validations learned from drills.
+func TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T) {
+	cfg := Config{
+		Startup: Startup{
+			ServiceChecklist: []ServiceChecklistCheck{
+				{
+					Name:           "custom-smoke",
+					URL:            "https://example.invalid/healthz",
+					TimeoutSeconds: 7,
+				},
+			},
+		},
+	}
+	cfg.applyDefaults()
+
+	names := map[string]struct{}{} // set of check names present after the merge
+	for _, check := range cfg.Startup.ServiceChecklist {
+		names[check.Name] = struct{}{}
+	}
+	if _, ok := names["custom-smoke"]; !ok {
+		t.Fatalf("expected custom checklist entry to be preserved")
+	}
+	if _, ok := names["logging-oidc-redirect"]; !ok {
+		t.Fatalf("expected default logging redirect check to be merged in")
+	}
+	if _, ok := names["vaultwarden-ui"]; !ok {
+		t.Fatalf("expected default vaultwarden check to be merged in")
+	}
+}
+
+// TestApplyDefaultsMergesCriticalServiceEndpointDefaults checks applyDefaults keeps a custom endpoint and adds defaults.
+// Signature: TestApplyDefaultsMergesCriticalServiceEndpointDefaults(t *testing.T).
+// Why: startup endpoint gating must keep baseline backend checks even when host
+// configs only provide a subset.
+func TestApplyDefaultsMergesCriticalServiceEndpointDefaults(t *testing.T) {
+	cfg := Config{
+		Startup: Startup{
+			CriticalServiceEndpoints: []string{"customns/customsvc"},
+		},
+	}
+	cfg.applyDefaults()
+
+	seen := map[string]struct{}{} // set of endpoint entries present after the merge
+	for _, entry := range cfg.Startup.CriticalServiceEndpoints {
+		seen[entry] = struct{}{}
+	}
+	if _, ok := seen["customns/customsvc"]; !ok {
+		t.Fatalf("expected custom critical endpoint to be preserved")
+	}
+	if _, ok := seen["logging/opensearch-dashboards"]; !ok {
+		t.Fatalf("expected logging/opensearch-dashboards critical endpoint default")
+	}
+	if _, ok := seen["monitoring/victoria-metrics-single-server"]; !ok {
+		t.Fatalf("expected monitoring/victoria-metrics-single-server critical endpoint default")
+	}
+}
diff --git a/internal/config/defaults.go b/internal/config/defaults.go
new file mode 100644
index 0000000..a848a40
--- /dev/null
+++ b/internal/config/defaults.go
@@ -0,0 +1,155 @@
+package config
+
+// defaults returns a Config pre-populated with the built-in baseline values.
+// Signature: defaults() Config.
+// Why: gives callers and tests a fully populated baseline; applyDefaults runs last so its merge/clamp rules also apply.
+func defaults() Config {
+	c := Config{
+		IACRepoPath:        "/opt/titan-iac",
+		ExpectedFluxBranch: "main",
+		ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
+		SSHPort:            2277,
+		ControlPlanes:      []string{"titan-0a", "titan-0b", "titan-0c"},
+		LocalBootstrapPaths: []string{
+			"infrastructure/core",
+			"clusters/atlas/flux-system",
+			"infrastructure/sources/helm",
+			"infrastructure/metallb",
+			"infrastructure/traefik",
+			"infrastructure/cert-manager",
+			"infrastructure/vault-csi",
+			"infrastructure/vault-injector",
+			"services/vault",
+			"infrastructure/postgres",
+			"services/gitea",
+			"services/keycloak",
+			"services/oauth2-proxy",
+		},
+		ExcludedNamespaces: []string{
+			"kube-system",
+			"kube-public",
+			"kube-node-lease",
+			"flux-system",
+			"traefik",
+			"metallb-system",
+			"cert-manager",
+			"longhorn-system",
+			"vault",
+			"postgres",
+			"maintenance",
+		},
+		Startup: Startup{
+			APIWaitSeconds:                1200,
+			APIPollSeconds:                2,
+			ShutdownCooldownSeconds:       45,
+			RequireNodeInventoryReach:     true,
+			NodeInventoryReachWaitSeconds: 300,
+			NodeInventoryReachPollSeconds: 5,
+			RequireTimeSync:               true,
+			TimeSyncWaitSeconds:           240,
+			TimeSyncPollSeconds:           5,
+			TimeSyncMode:                  "quorum",
+			TimeSyncQuorum:                2,
+			ReconcileAccessOnBoot:         true,
+			AutoEtcdRestoreOnAPIFailure:   true,
+			EtcdRestoreControlPlane:       "titan-0a",
+			RequireStorageReady:           true,
+			StorageReadyWaitSeconds:       420,
+			StorageReadyPollSeconds:       5,
+			StorageMinReadyNodes:          2,
+			StorageCriticalPVCs: []string{
+				"vault/data-vault-0",
+				"postgres/postgres-data-postgres-0",
+				"gitea/gitea-data",
+				"sso/keycloak-data",
+			},
+			MinimumBatteryPercent: 20,
+			RequiredNodeLabels: map[string]map[string]string{
+				"titan-09": {
+					"ananke.bstein.dev/harbor-bootstrap": "true",
+				},
+			},
+			RequirePostStartProbes:    true,
+			PostStartProbeWaitSeconds: 240,
+			PostStartProbePollSeconds: 5,
+			PostStartProbes: []string{
+				"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
+				"https://scm.bstein.dev/api/healthz",
+				"https://metrics.bstein.dev/api/health",
+			},
+			RequireServiceChecklist:         true,
+			ServiceChecklistWaitSeconds:     420,
+			ServiceChecklistPollSeconds:     5,
+			ServiceChecklistStabilitySec:    120,
+			ServiceChecklist:                defaultServiceChecklist(),
+			RequireCriticalServiceEndpoints: true,
+			CriticalServiceEndpointWaitSec:  420,
+			CriticalServiceEndpointPollSec:  5,
+			CriticalServiceEndpoints:        defaultCriticalServiceEndpoints(),
+			RequireIngressChecklist:         true,
+			IngressChecklistWaitSeconds:     420,
+			IngressChecklistPollSeconds:     5,
+			IngressChecklistAccepted:        []int{200, 301, 302, 307, 308, 401, 403, 404},
+			IngressChecklistIgnoreHosts:     []string{},
+			RequireNodeSSHAuth:              true,
+			NodeSSHAuthWaitSeconds:          240,
+			NodeSSHAuthPollSeconds:          5,
+			RequireFluxHealth:               true,
+			FluxHealthWaitSeconds:           900,
+			FluxHealthPollSeconds:           5,
+			IgnoreFluxKustomizations:        []string{},
+			RequireWorkloadConvergence:      true,
+			WorkloadConvergenceWaitSeconds:  900,
+			WorkloadConvergencePollSeconds:  5,
+			IgnoreWorkloadNamespaces:        []string{},
+			IgnoreWorkloads:                 []string{},
+			IgnoreUnavailableNodes:          []string{},
+			AutoRecycleStuckPods:            true,
+			StuckPodGraceSeconds:            180,
+			VaultUnsealKeyFile:              "/var/lib/ananke/vault-unseal.key",
+			VaultUnsealBreakglassTimeout:    15,
+		},
+		Shutdown: Shutdown{
+			DefaultBudgetSeconds: 1380,
+			HistoryMinSamples:    3,
+			EmergencyBudgetSec:   420,
+			EmergencyMinSamples:  3,
+			EmergencySkipEtcd:    true,
+			EmergencySkipDrain:   true,
+			DrainParallelism:     6,
+			ScaleParallelism:     8,
+			SSHParallelism:       8,
+		},
+		UPS: UPS{
+			Enabled:                 true,
+			Provider:                "nut",
+			PollSeconds:             5,
+			RuntimeSafetyFactor:     1.25,
+			DebounceCount:           3,
+			TelemetryTimeoutSeconds: 90,
+		},
+		Coordination: Coordination{
+			ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
+			PeerHosts:             []string{},
+			FallbackLocalShutdown: true,
+			CommandTimeoutSeconds: 25,
+			StartupGuardMaxAgeSec: 900,
+			Role:                  "coordinator",
+			AllowStartupOnBattery: false,
+		},
+		Metrics: Metrics{
+			Enabled:  true,
+			BindAddr: "0.0.0.0:9560",
+			Path:     "/metrics",
+		},
+		State: State{
+			Dir:            "/var/lib/ananke",
+			ReportsDir:     "/var/lib/ananke/reports",
+			RunHistoryPath: "/var/lib/ananke/runs.json",
+			LockPath:       "/var/lib/ananke/ananke.lock",
+			IntentPath:     "/var/lib/ananke/intent.json",
+		},
+	}
+	c.applyDefaults() // runs the same fill/merge/clamp pass over the literal above
+	return c
+}
diff --git a/internal/config/startup_service_catalog.go b/internal/config/startup_service_catalog.go
new file mode 100644
index 0000000..922ede8
--- /dev/null
+++ b/internal/config/startup_service_catalog.go
@@ -0,0 +1,315 @@
+package config
+
+import "strings"
+
+// defaultServiceChecklist returns the built-in per-service checks (URL, accepted statuses, body/Location match, timeout).
+// Signature: defaultServiceChecklist() []ServiceChecklistCheck.
+// Why: startup must verify real external behavior per service (not only generic
+// ingress reachability) so false positives do not pass drills.
+func defaultServiceChecklist() []ServiceChecklistCheck {
+	return []ServiceChecklistCheck{
+		{
+			Name:             "gitea-api",
+			URL:              "https://scm.bstein.dev/api/healthz",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "pass",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "grafana-api",
+			URL:              "https://metrics.bstein.dev/api/health",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "\"database\":\"ok\"",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "keycloak-oidc",
+			URL:              "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "harbor-registry-api",
+			URL:              "https://registry.bstein.dev/v2/",
+			AcceptedStatuses: []int{401},
+			BodyContains:     "unauthorized",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "alerts-ui",
+			URL:              "https://alerts.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "Alertmanager",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "auth-gateway-redirect",
+			URL:              "https://auth.bstein.dev/",
+			AcceptedStatuses: []int{302},
+			LocationContains: "https://sso.bstein.dev/realms/atlas/",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "home-site",
+			URL:              "https://bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "Titan Lab",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "actual-budget-ui",
+			URL:              "https://budget.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "<title>Actual",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "element-call-ui",
+			URL:              "https://call.live.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "Element Call",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "flux-gitops-ui",
+			URL:              "https://cd.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "Weave GitOps",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "chat-ai-health",
+			URL:              "https://chat.ai.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "\"ok\": true",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "jenkins-auth-gate",
+			URL:              "https://ci.bstein.dev/",
+			AcceptedStatuses: []int{403},
+			BodyContains:     "commenceLogin",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "nextcloud-login-redirect",
+			URL:              "https://cloud.bstein.dev/",
+			AcceptedStatuses: []int{302},
+			LocationContains: "/index.php/login",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "wger-redirect",
+			URL:              "https://health.bstein.dev/",
+			AcceptedStatuses: []int{302},
+			LocationContains: "/en/",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "livekit-edge",
+			URL:              "https://kit.live.bstein.dev/",
+			AcceptedStatuses: []int{404},
+			BodyContains:     "404 page not found",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "element-web-ui",
+			URL:              "https://live.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "<title>Element</title>",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "logging-oidc-redirect",
+			URL:              "https://logs.bstein.dev/",
+			AcceptedStatuses: []int{302},
+			LocationContains: "client_id=logs",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "longhorn-oidc-redirect",
+			URL:              "https://longhorn.bstein.dev/",
+			AcceptedStatuses: []int{302},
+			LocationContains: "https://sso.bstein.dev/realms/atlas/",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "matrix-auth-ui",
+			URL:              "https://matrix.live.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "matrix-authentication-service",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "monero-edge",
+			URL:              "https://monero.bstein.dev/",
+			AcceptedStatuses: []int{404}, // status-only entry: no body or Location match is set
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "firefly-login-redirect",
+			URL:              "https://money.bstein.dev/",
+			AcceptedStatuses: []int{302},
+			LocationContains: "/login",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "outline-ui",
+			URL:              "https://notes.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "<title>Outline</title>",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "collabora-probe",
+			URL:              "https://office.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "OK",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "pegasus-ui",
+			URL:              "https://pegasus.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "<title>Pegasus</title>",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "harbor-ui",
+			URL:              "https://registry.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "<title>Harbor</title>",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "vault-ui-redirect",
+			URL:              "https://secret.bstein.dev/",
+			AcceptedStatuses: []int{307},
+			LocationContains: "/ui/",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "sentinel-oidc-redirect",
+			URL:              "https://sentinel.bstein.dev/",
+			AcceptedStatuses: []int{302},
+			LocationContains: "client_id=metis",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "keycloak-admin-redirect",
+			URL:              "https://sso.bstein.dev/",
+			AcceptedStatuses: []int{302},
+			LocationContains: "https://sso.bstein.dev/admin/",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "jellyfin-edge",
+			URL:              "https://stream.bstein.dev/",
+			AcceptedStatuses: []int{302},
+			LocationContains: "web/",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "planka-ui",
+			URL:              "https://tasks.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "<title>PLANKA</title>",
+			TimeoutSeconds:   12,
+		},
+		{
+			Name:             "vaultwarden-ui",
+			URL:              "https://vault.bstein.dev/",
+			AcceptedStatuses: []int{200},
+			BodyContains:     "<title>Vaultwarden Web</title>",
+			TimeoutSeconds:   12,
+		},
+	}
+}
+
+// defaultCriticalServiceEndpoints returns the built-in baseline list of critical service endpoint entries.
+// Signature: defaultCriticalServiceEndpoints() []string.
+// Why: service edge checks are insufficient for protected stacks; endpoint
+// presence verifies that backends are actually routable before startup success.
+func defaultCriticalServiceEndpoints() []string {
+	return []string{
+		"monitoring/victoria-metrics-single-server",
+		"monitoring/grafana",
+		"monitoring/kube-state-metrics",
+		"logging/oauth2-proxy-logs",
+		"logging/opensearch-dashboards",
+		"logging/opensearch-master",
+	}
+}
+
+// mergeServiceChecklistDefaults returns existing plus every default whose trimmed Name is not already in existing.
+// Signature: mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck.
+// Why: host configs can keep custom checks while still inheriting mandatory
+// baseline checks introduced after incident learnings.
+func mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck {
+	if len(existing) == 0 { // nothing configured: return a fresh copy of defaults
+		out := make([]ServiceChecklistCheck, 0, len(defaults))
+		out = append(out, defaults...)
+		return out
+	}
+
+	byName := map[string]struct{}{} // trimmed names of the configured checks
+	for _, check := range existing {
+		name := strings.TrimSpace(check.Name)
+		if name == "" {
+			continue // unnamed configured checks cannot shadow a default
+		}
+		byName[name] = struct{}{}
+	}
+
+	out := make([]ServiceChecklistCheck, 0, len(existing)+len(defaults))
+	out = append(out, existing...) // configured entries are kept verbatim, unnamed ones included
+	for _, check := range defaults {
+		name := strings.TrimSpace(check.Name)
+		if name == "" {
+			continue
+		}
+		if _, exists := byName[name]; exists {
+			continue // a configured check with the same name wins over the default
+		}
+		out = append(out, check)
+	}
+	return out
+}
+
+// mergeStringDefaults returns trimmed, de-duplicated existing items followed by defaults not already present.
+// Signature: mergeStringDefaults(existing, defaults []string) []string.
+// Why: keeps baseline startup guards applied while preserving site-specific
+// additions already declared in host configs.
+func mergeStringDefaults(existing, defaults []string) []string {
+	if len(existing) == 0 { // empty input: defaults are copied as-is (no trim or de-dup)
+		out := make([]string, 0, len(defaults))
+		out = append(out, defaults...)
+		return out
+	}
+	seen := map[string]struct{}{} // trimmed values already emitted
+	out := make([]string, 0, len(existing)+len(defaults))
+	for _, item := range existing {
+		key := strings.TrimSpace(item)
+		if key == "" {
+			continue // blank entries are dropped
+		}
+		if _, ok := seen[key]; ok {
+			continue
+		}
+		seen[key] = struct{}{}
+		out = append(out, key) // the trimmed form is what gets stored
+	}
+	for _, item := range defaults {
+		key := strings.TrimSpace(item)
+		if key == "" {
+			continue
+		}
+		if _, ok := seen[key]; ok {
+			continue
+		}
+		seen[key] = struct{}{}
+		out = append(out, key)
+	}
+	return out
+}
diff --git a/internal/config/types.go b/internal/config/types.go
new file mode 100644
index 0000000..a253c8f
--- /dev/null
+++ b/internal/config/types.go
@@ -0,0 +1,156 @@
+package config
+// Config groups the tool's settings: SSH access, node lists and repo/flux expectations plus the nested startup, shutdown, ups, coordination, metrics and state sections; each field is bound to its key by the yaml struct tag.
+type Config struct {
+	Kubeconfig          string            `yaml:"kubeconfig"`
+	SSHUser             string            `yaml:"ssh_user"`
+	SSHPort             int               `yaml:"ssh_port"`
+	SSHConfigFile       string            `yaml:"ssh_config_file"`
+	SSHIdentityFile     string            `yaml:"ssh_identity_file"`
+	SSHNodeHosts        map[string]string `yaml:"ssh_node_hosts"`
+	SSHNodeUsers        map[string]string `yaml:"ssh_node_users"`
+	SSHManagedNodes     []string          `yaml:"ssh_managed_nodes"`
+	SSHJumpHost         string            `yaml:"ssh_jump_host"`
+	SSHJumpUser         string            `yaml:"ssh_jump_user"`
+	IACRepoPath         string            `yaml:"iac_repo_path"`
+	ExpectedFluxBranch  string            `yaml:"expected_flux_branch"`
+	ExpectedFluxSource  string            `yaml:"expected_flux_source_url"`
+	ControlPlanes       []string          `yaml:"control_planes"`
+	Workers             []string          `yaml:"workers"`
+	LocalBootstrapPaths []string          `yaml:"local_bootstrap_paths"`
+	ExcludedNamespaces  []string          `yaml:"excluded_namespaces"`
+	Startup             Startup           `yaml:"startup"`
+	Shutdown            Shutdown          `yaml:"shutdown"`
+	UPS                 UPS               `yaml:"ups"`
+	Coordination        Coordination      `yaml:"coordination"`
+	Metrics             Metrics           `yaml:"metrics"`
+	State               State             `yaml:"state"`
+}
+// Startup is the section stored in Config.Startup (yaml key "startup"). Most checks follow a Require* bool plus *WaitSeconds/*PollSeconds ints pattern; some Go names are abbreviated, so the yaml tag is the authoritative config key.
+type Startup struct {
+	APIWaitSeconds                  int                          `yaml:"api_wait_seconds"`
+	APIPollSeconds                  int                          `yaml:"api_poll_seconds"`
+	ShutdownCooldownSeconds         int                          `yaml:"shutdown_cooldown_seconds"`
+	MinimumBatteryPercent           float64                      `yaml:"minimum_battery_percent"`
+	RequireNodeInventoryReach       bool                         `yaml:"require_node_inventory_reachability"`
+	NodeInventoryReachWaitSeconds   int                          `yaml:"node_inventory_reachability_wait_seconds"`
+	NodeInventoryReachPollSeconds   int                          `yaml:"node_inventory_reachability_poll_seconds"`
+	RequiredNodeLabels              map[string]map[string]string `yaml:"required_node_labels"`
+	RequireTimeSync                 bool                         `yaml:"require_time_sync"`
+	TimeSyncWaitSeconds             int                          `yaml:"time_sync_wait_seconds"`
+	TimeSyncPollSeconds             int                          `yaml:"time_sync_poll_seconds"`
+	TimeSyncMode                    string                       `yaml:"time_sync_mode"`
+	TimeSyncQuorum                  int                          `yaml:"time_sync_quorum"`
+	ReconcileAccessOnBoot           bool                         `yaml:"reconcile_access_on_boot"`
+	AutoEtcdRestoreOnAPIFailure     bool                         `yaml:"auto_etcd_restore_on_api_failure"`
+	EtcdRestoreControlPlane         string                       `yaml:"etcd_restore_control_plane"`
+	RequireStorageReady             bool                         `yaml:"require_storage_ready"`
+	StorageReadyWaitSeconds         int                          `yaml:"storage_ready_wait_seconds"`
+	StorageReadyPollSeconds         int                          `yaml:"storage_ready_poll_seconds"`
+	StorageMinReadyNodes            int                          `yaml:"storage_min_ready_nodes"`
+	StorageCriticalPVCs             []string                     `yaml:"storage_critical_pvcs"`
+	RequirePostStartProbes          bool                         `yaml:"require_post_start_probes"`
+	PostStartProbeWaitSeconds       int                          `yaml:"post_start_probe_wait_seconds"`
+	PostStartProbePollSeconds       int                          `yaml:"post_start_probe_poll_seconds"`
+	PostStartProbes                 []string                     `yaml:"post_start_probes"`
+	RequireServiceChecklist         bool                         `yaml:"require_service_checklist"`
+	ServiceChecklistWaitSeconds     int                          `yaml:"service_checklist_wait_seconds"`
+	ServiceChecklistPollSeconds     int                          `yaml:"service_checklist_poll_seconds"`
+	ServiceChecklistStabilitySec    int                          `yaml:"service_checklist_stability_seconds"`
+	ServiceChecklist                []ServiceChecklistCheck      `yaml:"service_checklist"`
+	RequireCriticalServiceEndpoints bool                         `yaml:"require_critical_service_endpoints"`
+	CriticalServiceEndpointWaitSec  int                          `yaml:"critical_service_endpoint_wait_seconds"`
+	CriticalServiceEndpointPollSec  int                          `yaml:"critical_service_endpoint_poll_seconds"`
+	CriticalServiceEndpoints        []string                     `yaml:"critical_service_endpoints"`
+	RequireIngressChecklist         bool                         `yaml:"require_ingress_checklist"`
+	IngressChecklistWaitSeconds     int                          `yaml:"ingress_checklist_wait_seconds"`
+	IngressChecklistPollSeconds     int                          `yaml:"ingress_checklist_poll_seconds"`
+	IngressChecklistAccepted        []int                        `yaml:"ingress_checklist_accepted_statuses"`
+	IngressChecklistIgnoreHosts     []string                     `yaml:"ingress_checklist_ignore_hosts"`
+	IngressChecklistInsecureSkip    bool                         `yaml:"ingress_checklist_insecure_skip_tls"`
+	RequireNodeSSHAuth              bool                         `yaml:"require_node_ssh_auth"`
+	NodeSSHAuthWaitSeconds          int                          `yaml:"node_ssh_auth_wait_seconds"`
+	NodeSSHAuthPollSeconds          int                          `yaml:"node_ssh_auth_poll_seconds"`
+	RequireFluxHealth               bool                         `yaml:"require_flux_health"`
+	FluxHealthWaitSeconds           int                          `yaml:"flux_health_wait_seconds"`
+	FluxHealthPollSeconds           int                          `yaml:"flux_health_poll_seconds"`
+	IgnoreFluxKustomizations        []string                     `yaml:"ignore_flux_kustomizations"`
+	RequireWorkloadConvergence      bool                         `yaml:"require_workload_convergence"`
+	WorkloadConvergenceWaitSeconds  int                          `yaml:"workload_convergence_wait_seconds"`
+	WorkloadConvergencePollSeconds  int                          `yaml:"workload_convergence_poll_seconds"`
+	IgnoreWorkloadNamespaces        []string                     `yaml:"ignore_workload_namespaces"`
+	IgnoreWorkloads                 []string                     `yaml:"ignore_workloads"`
+	IgnoreUnavailableNodes          []string                     `yaml:"ignore_unavailable_nodes"`
+	AutoRecycleStuckPods            bool                         `yaml:"auto_recycle_stuck_pods"`
+	StuckPodGraceSeconds            int                          `yaml:"stuck_pod_grace_seconds"`
+	VaultUnsealKeyFile              string                       `yaml:"vault_unseal_key_file"`
+	VaultUnsealBreakglassCommand    string                       `yaml:"vault_unseal_breakglass_command"`
+	VaultUnsealBreakglassTimeout    int                          `yaml:"vault_unseal_breakglass_timeout_seconds"`
+}
+// ServiceChecklistCheck is one entry of Startup.ServiceChecklist: a named URL plus accepted statuses, location/body contains and not-contains strings, a timeout in seconds and an insecure-skip-TLS flag; evaluation is not defined in this file.
+type ServiceChecklistCheck struct {
+	Name                string `yaml:"name"` // trimmed value is the identity key used by mergeServiceChecklistDefaults
+	URL                 string `yaml:"url"`
+	AcceptedStatuses    []int  `yaml:"accepted_statuses"`
+	LocationContains    string `yaml:"location_contains"`
+	LocationNotContains string `yaml:"location_not_contains"`
+	BodyContains        string `yaml:"body_contains"`
+	BodyNotContains     string `yaml:"body_not_contains"`
+	TimeoutSeconds      int    `yaml:"timeout_seconds"`
+	InsecureSkipTLS     bool   `yaml:"insecure_skip_tls"`
+}
+// Shutdown is the section stored in Config.Shutdown (yaml key "shutdown"): budget and history-sample settings, skip flags for etcd snapshot and drain with separate emergency_* variants, and three parallelism settings.
+type Shutdown struct {
+	DefaultBudgetSeconds int  `yaml:"default_budget_seconds"`
+	HistoryMinSamples    int  `yaml:"history_min_samples"`
+	EmergencyBudgetSec   int  `yaml:"emergency_budget_seconds"`
+	EmergencyMinSamples  int  `yaml:"emergency_history_min_samples"`
+	EmergencySkipEtcd    bool `yaml:"emergency_skip_etcd_snapshot"`
+	EmergencySkipDrain   bool `yaml:"emergency_skip_drain"`
+	SkipEtcdSnapshot     bool `yaml:"skip_etcd_snapshot"`
+	SkipDrain            bool `yaml:"skip_drain"`
+	DrainParallelism     int  `yaml:"drain_parallelism"`
+	ScaleParallelism     int  `yaml:"scale_parallelism"`
+	SSHParallelism       int  `yaml:"ssh_parallelism"`
+}
+// UPS is the section stored in Config.UPS (yaml key "ups"); it carries both a single Target and a Targets list of UPSTarget entries alongside provider, polling, safety-factor, debounce and telemetry-timeout settings.
+type UPS struct {
+	Enabled                 bool        `yaml:"enabled"`
+	Provider                string      `yaml:"provider"`
+	Target                  string      `yaml:"target"`
+	Targets                 []UPSTarget `yaml:"targets"` // NOTE(review): coexists with Target above; which one wins is decided outside this file
+	PollSeconds             int         `yaml:"poll_seconds"`
+	RuntimeSafetyFactor     float64     `yaml:"runtime_safety_factor"`
+	DebounceCount           int         `yaml:"debounce_count"`
+	TelemetryTimeoutSeconds int         `yaml:"telemetry_timeout_seconds"`
+}
+// UPSTarget is one entry of UPS.Targets, pairing a name with a target string.
+type UPSTarget struct {
+	Name   string `yaml:"name"`
+	Target string `yaml:"target"`
+}
+// Coordination is the section stored in Config.Coordination (yaml key "coordination"): shutdown-forwarding host/user/config, peer hosts, a local-shutdown fallback flag, command timeout, startup-guard max age, role and an allow-startup-on-battery flag.
+type Coordination struct {
+	ForwardShutdownHost   string   `yaml:"forward_shutdown_host"`
+	ForwardShutdownUser   string   `yaml:"forward_shutdown_user"`
+	ForwardShutdownConfig string   `yaml:"forward_shutdown_config"`
+	PeerHosts             []string `yaml:"peer_hosts"`
+	FallbackLocalShutdown bool     `yaml:"fallback_local_shutdown"`
+	CommandTimeoutSeconds int      `yaml:"command_timeout_seconds"`
+	StartupGuardMaxAgeSec int      `yaml:"startup_guard_max_age_seconds"`
+	Role                  string   `yaml:"role"`
+	AllowStartupOnBattery bool     `yaml:"allow_startup_on_battery"`
+}
+// Metrics is the section stored in Config.Metrics (yaml key "metrics"): an enabled flag, a bind address and a path.
+type Metrics struct {
+	Enabled  bool   `yaml:"enabled"`
+	BindAddr string `yaml:"bind_addr"`
+	Path     string `yaml:"path"`
+}
+// State is the section stored in Config.State (yaml key "state"): a dir plus reports-dir, run-history, lock and intent paths.
+type State struct {
+	Dir            string `yaml:"dir"`
+	ReportsDir     string `yaml:"reports_dir"`
+	RunHistoryPath string `yaml:"run_history_path"`
+	LockPath       string `yaml:"lock_path"`
+	IntentPath     string `yaml:"intent_path"`
+}