startup: enforce external service behavior checks

2026-04-08 23:42:09 -03:00 · 2026-04-08 23:42:09 -03:00 · 95fefba244
commit 95fefba244
parent 296ca85c78
8 changed files with 1615 additions and 67 deletions
--- a/internal/cluster/orchestrator_core.go
+++ b/internal/cluster/orchestrator_core.go
@ -0,0 +1,124 @@
 package cluster
 import (
 	"context"
 	"errors"
 	"log"
 	"regexp"
 	"sync"
 	"time"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 	"scm.bstein.dev/bstein/ananke/internal/execx"
 	"scm.bstein.dev/bstein/ananke/internal/state"
 )
 // Orchestrator bundles the configuration, command runner, state store, and
 // logger that the cluster workflow methods in this package operate on.
 type Orchestrator struct {
 	cfg                  config.Config
 	runner               *execx.Runner
 	store                *state.Store
 	log                  *log.Logger
 	// runOverride and runSensitiveOverride are optional command hooks; they are
 	// nil after New and are installed through SetCommandOverrides.
 	runOverride          func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
 	runSensitiveOverride func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
 	// NOTE(review): startupReportMu presumably guards activeStartupReport —
 	// confirm against the startup-report helpers, which are not visible here.
 	startupReportMu      sync.Mutex
 	activeStartupReport  *startupReport
 }
 // commandOverrideFunc is the hook signature accepted by SetCommandOverrides: it
 // receives a context, a timeout, and a command name with arguments, and
 // returns a string result and an error.
 type commandOverrideFunc func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
 // StartupOptions carries caller-supplied settings for a startup run.
 // NOTE(review): the consumers of these fields are outside this view; the
 // field names suggest a Flux branch override, a local-bootstrap skip flag, and
 // a free-form reason — confirm against the startup workflow.
 type StartupOptions struct {
 	ForceFluxBranch    string
 	SkipLocalBootstrap bool
 	Reason             string
 }
 // ShutdownOptions carries caller-supplied settings for a shutdown run.
 // NOTE(review): the consumers of these fields are outside this view; valid
 // values for Mode are not established here — confirm against the shutdown
 // workflow.
 type ShutdownOptions struct {
 	SkipEtcdSnapshot bool
 	SkipDrain        bool
 	Mode             string
 	Reason           string
 }
 // EtcdRestoreOptions carries the parameters of an etcd restore request.
 // NOTE(review): presumably ControlPlane names the target node and SnapshotPath
 // the snapshot file to restore — confirm against the restore implementation.
 type EtcdRestoreOptions struct {
 	ControlPlane string
 	SnapshotPath string
 }
 // startupWorkload identifies one workload by namespace, kind, and name. In
 // this file the kinds used are "deployment" and "statefulset".
 type startupWorkload struct {
 	Namespace string
 	Kind      string
 	Name      string
 }
 // workloadScaleEntry is the JSON-serializable record of one workload's
 // identity together with a replica count.
 type workloadScaleEntry struct {
 	Namespace string `json:"namespace"`
 	Kind      string `json:"kind"`
 	Name      string `json:"name"`
 	Replicas  int    `json:"replicas"`
 }
 // remotePeerStatus pairs a state.Intent with a bootstrap-active flag.
 // NOTE(review): presumably this is what a coordination peer reports about
 // itself — confirm against the peer-query code, which is not visible here.
 type remotePeerStatus struct {
 	Intent          state.Intent
 	BootstrapActive bool
 }
 // workloadScaleSnapshot is a timestamped, JSON-serializable list of
 // workloadScaleEntry records.
 type workloadScaleSnapshot struct {
 	GeneratedAt time.Time            `json:"generated_at"`
 	Entries     []workloadScaleEntry `json:"entries"`
 }
 // startupReport is the JSON-serializable summary of one startup run: timing,
 // status/phase strings, the overall outcome, per-check records keyed by name,
 // and a list of auto-heal notes.
 // NOTE(review): the set of valid Status and Phase values is defined by the
 // report writers, which are outside this view.
 type startupReport struct {
 	StartedAt   time.Time                     `json:"started_at"`
 	Completed   time.Time                     `json:"completed_at"`
 	Reason      string                        `json:"reason"`
 	Status      string                        `json:"status"`
 	Phase       string                        `json:"phase"`
 	Success     bool                          `json:"success"`
 	// Error is omitted from the JSON output when empty.
 	Error       string                        `json:"error,omitempty"`
 	Checks      map[string]startupCheckRecord `json:"checks"`
 	AutoHeals   []string                      `json:"auto_heals"`
 	SourceHost  string                        `json:"source_host"`
 	LastUpdated time.Time                     `json:"last_updated"`
 }
 // startupCheckRecord is the JSON-serializable state of a single startup check:
 // a status string, a human-readable detail, and the time it was last updated.
 type startupCheckRecord struct {
 	Status    string    `json:"status"`
 	Detail    string    `json:"detail"`
 	UpdatedAt time.Time `json:"updated_at"`
 }
 // datastoreEndpointPattern matches a `--datastore-endpoint` flag followed by
 // `=` or whitespace and then its value. The value is captured in group 1 when
 // single-quoted, group 2 when double-quoted, and group 3 when bare (a bare
 // value ends at the first whitespace or backslash).
 var datastoreEndpointPattern = regexp.MustCompile(`--datastore-endpoint(?:=|\s+)(?:'([^']+)'|"([^"]+)"|([^\s\\]+))`)
 // criticalStartupWorkloads lists the deployments and statefulsets (Flux
 // controllers, vault, postgres, gitea, monitoring, and logging) that this
 // package treats as critical during startup.
 // NOTE(review): the code that consumes this list is outside this view —
 // presumably the critical-workload replica healer; confirm before editing.
 var criticalStartupWorkloads = []startupWorkload{
 	{Namespace: "flux-system", Kind: "deployment", Name: "source-controller"},
 	{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
 	{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
 	{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},
 	{Namespace: "vault", Kind: "statefulset", Name: "vault"},
 	{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
 	{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
 	{Namespace: "monitoring", Kind: "deployment", Name: "grafana"},
 	{Namespace: "monitoring", Kind: "statefulset", Name: "victoria-metrics-single-server"},
 	{Namespace: "monitoring", Kind: "deployment", Name: "kube-state-metrics"},
 	{Namespace: "logging", Kind: "deployment", Name: "oauth2-proxy-logs"},
 	{Namespace: "logging", Kind: "deployment", Name: "opensearch-dashboards"},
 	{Namespace: "logging", Kind: "statefulset", Name: "opensearch"},
 }
 // ErrEtcdRestoreNotApplicable is a sentinel error; compare with errors.Is.
 // NOTE(review): presumably returned when an etcd restore is requested on a
 // cluster that does not use etcd — confirm against the restore code.
 var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
 // New builds an Orchestrator from its configuration, runner, store, and logger.
 // Signature: New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator.
 // Why: a single constructor keeps the wiring in one place; the command
 // overrides and startup-report fields are left at their zero values.
 func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
 	orch := new(Orchestrator)
 	orch.cfg = cfg
 	orch.runner = runner
 	orch.store = store
 	orch.log = logger
 	return orch
 }
 // SetCommandOverrides installs the regular and sensitive command hooks on the
 // orchestrator, replacing whatever was set before (nil clears a hook).
 // Signature: (o *Orchestrator) SetCommandOverrides(run commandOverrideFunc, runSensitive commandOverrideFunc).
 // Why: enables deterministic integration testing from the top-level testing module
 // without requiring package-local test files or live cluster dependencies.
 func (o *Orchestrator) SetCommandOverrides(run commandOverrideFunc, runSensitive commandOverrideFunc) {
 	o.runOverride, o.runSensitiveOverride = run, runSensitive
 }
--- a/internal/cluster/orchestrator_service_stability.go
+++ b/internal/cluster/orchestrator_service_stability.go
@ -0,0 +1,389 @@
 package cluster
 import (
 	"context"
 	"crypto/tls"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
 	"strings"
 	"time"
 	"unicode"
 	"scm.bstein.dev/bstein/ananke/internal/config"
 )
 // isLikelyHostname reports whether value, after trimming surrounding
 // whitespace, looks like a dotted hostname: non-empty, free of spaces and
 // slashes, and containing at least one ".".
 // Signature: isLikelyHostname(value string) bool.
 // Why: a cheap heuristic to tell host-like tokens apart from URLs, paths, and
 // free-form text without parsing.
 func isLikelyHostname(value string) bool {
 	candidate := strings.TrimSpace(value)
 	switch {
 	case candidate == "":
 		return false
 	case strings.ContainsAny(candidate, " /"):
 		// Spaces or slashes mean free text, a URL, or a path rather than a host.
 		return false
 	default:
 		return strings.Contains(candidate, ".")
 	}
 }
 // healIngressHostBackendReplicas scales to one replica every deployment and
 // statefulset that is explicitly set to fewer than one replica in the
 // namespaces discovered for host. It returns the "namespace/kind/name" keys of
 // the workloads it scaled; the slice is nil when no namespaces were discovered
 // or when discovery, the workload query, or JSON decoding fails, and non-nil
 // otherwise.
 // Signature: (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error).
 // Why: a failing ingress host is often backed by a workload left at zero
 // replicas; workloads with no replica count set are left alone.
 func (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error) {
 	ingressNamespaces, err := o.discoverIngressNamespacesForHost(ctx, host)
 	switch {
 	case err != nil:
 		return nil, err
 	case len(ingressNamespaces) == 0:
 		return nil, nil
 	}
 	wanted := makeStringSet(ingressNamespaces)
 	raw, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
 	if err != nil {
 		return nil, fmt.Errorf("query workloads: %w", err)
 	}
 	var workloads workloadList
 	if decodeErr := json.Unmarshal([]byte(raw), &workloads); decodeErr != nil {
 		return nil, fmt.Errorf("decode workloads: %w", decodeErr)
 	}
 	scaled := make([]string, 0)
 	for _, entry := range workloads.Items {
 		kind := strings.ToLower(strings.TrimSpace(entry.Kind))
 		namespace := strings.TrimSpace(entry.Metadata.Namespace)
 		name := strings.TrimSpace(entry.Metadata.Name)
 		if namespace == "" || name == "" {
 			continue
 		}
 		// An empty kind is rejected here as well, since it matches neither value.
 		if kind != "deployment" && kind != "statefulset" {
 			continue
 		}
 		if _, tracked := wanted[namespace]; !tracked {
 			continue
 		}
 		// A missing replica count is treated as already running (>= 1).
 		if entry.Spec.Replicas == nil || *entry.Spec.Replicas >= 1 {
 			continue
 		}
 		target := startupWorkload{Namespace: namespace, Kind: kind, Name: name}
 		scaleErr := o.ensureWorkloadReplicas(ctx, target, 1)
 		if scaleErr == nil {
 			scaled = append(scaled, namespace+"/"+kind+"/"+name)
 			continue
 		}
 		// The workload disappeared between listing and scaling; nothing to heal.
 		if isNotFoundErr(scaleErr) {
 			continue
 		}
 		return scaled, fmt.Errorf("scale %s/%s/%s to 1: %w", namespace, kind, name, scaleErr)
 	}
 	return scaled, nil
 }
 // waitForServiceChecklist polls the external service checklist until it
 // passes, the wait budget is exhausted, or ctx is cancelled. Before each probe
 // it invokes the stuck-pod recycler and the critical-workload replica healer;
 // after a failed probe it invokes the ingress-backend healer with the failure
 // detail.
 // Signature: (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error.
 // Why: startup should not be declared done until user-facing services answer
 // as configured. Defaults are 7 minutes of waiting and a 5 second poll when
 // the configured values are not positive.
 func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error {
 	budget := time.Duration(o.cfg.Startup.ServiceChecklistWaitSeconds) * time.Second
 	if budget <= 0 {
 		budget = 7 * time.Minute
 	}
 	interval := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second
 	if interval <= 0 {
 		interval = 5 * time.Second
 	}
 	giveUpAt := time.Now().Add(budget)
 	// Zero timestamps make the first log line and first heal attempts eligible immediately.
 	var loggedAt, recycleStamp, replicaHealStamp, ingressHealStamp time.Time
 	failure := "unknown"
 	for {
 		o.maybeAutoRecycleStuckPods(ctx, &recycleStamp)
 		o.maybeAutoHealCriticalWorkloadReplicas(ctx, &replicaHealStamp)
 		previous := failure
 		passed, detail := o.serviceChecklistReady(ctx)
 		failure = detail
 		if passed {
 			o.log.Printf("external service checklist passed (%s)", detail)
 			return nil
 		}
 		o.maybeAutoHealIngressHostBackends(ctx, &ingressHealStamp, failure)
 		// Log when the failure text changes, otherwise at most every 30 seconds.
 		changed := failure != previous
 		if changed || time.Since(loggedAt) >= 30*time.Second {
 			left := time.Until(giveUpAt).Round(time.Second)
 			if left < 0 {
 				left = 0
 			}
 			o.log.Printf("waiting for external service checklist (%s remaining): %s", left, failure)
 			loggedAt = time.Now()
 		}
 		if time.Now().After(giveUpAt) {
 			return fmt.Errorf("startup blocked: external service checklist not satisfied within %s (%s)", budget, failure)
 		}
 		select {
 		case <-time.After(interval):
 		case <-ctx.Done():
 			return ctx.Err()
 		}
 	}
 }
 // serviceChecklistReady runs the configured service checks in order and stops
 // at the first failure. It returns true with a "checks=N" summary when every
 // check passes (or when none are configured), otherwise false with
 // "<name>: <detail>", falling back to the check URL when the name is blank.
 // Signature: (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string).
 // Why: one failure detail at a time keeps the waiting log readable.
 func (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string) {
 	items := o.cfg.Startup.ServiceChecklist
 	if len(items) == 0 {
 		return true, "no checklist items configured"
 	}
 	for i := range items {
 		passed, detail := o.serviceCheckReady(ctx, items[i])
 		if passed {
 			continue
 		}
 		label := strings.TrimSpace(items[i].Name)
 		if label == "" {
 			label = strings.TrimSpace(items[i].URL)
 		}
 		return false, fmt.Sprintf("%s: %s", label, detail)
 	}
 	return true, fmt.Sprintf("checks=%d", len(items))
 }
 // serviceCheckReady probes one checklist entry and validates the response: the
 // status code must be in the accepted list, then the Location header and the
 // body are checked against the optional contains / not-contains markers, in
 // that order. It returns true with "status=N" on success, or false with the
 // first failing reason.
 // Signature: (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string).
 // Why: a status code alone cannot distinguish a healthy service from an error
 // page or a misdirected redirect. When no statuses are configured, common
 // success, redirect, and auth-gate codes are accepted.
 func (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string) {
 	probe, err := o.httpChecklistProbeResult(ctx, check)
 	if err != nil {
 		return false, err.Error()
 	}
 	allowed := check.AcceptedStatuses
 	if len(allowed) == 0 {
 		allowed = []int{200, 201, 202, 203, 204, 301, 302, 303, 307, 308, 401, 403}
 	}
 	matched := false
 	for i := 0; i < len(allowed) && !matched; i++ {
 		matched = allowed[i] == probe.Status
 	}
 	if !matched {
 		return false, fmt.Sprintf("unexpected status code=%d", probe.Status)
 	}
 	// Marker rules are evaluated top to bottom; blank markers are skipped.
 	rules := []struct {
 		marker      string
 		haystack    string
 		mustContain bool
 		failure     string
 	}{
 		{check.LocationContains, probe.Location, true, "location header missing expected marker %q"},
 		{check.LocationNotContains, probe.Location, false, "location header contained forbidden marker %q"},
 		{check.BodyContains, probe.Body, true, "response missing expected marker %q"},
 		{check.BodyNotContains, probe.Body, false, "response contained forbidden marker %q"},
 	}
 	for _, rule := range rules {
 		marker := strings.TrimSpace(rule.marker)
 		if marker == "" {
 			continue
 		}
 		if checklistContains(rule.haystack, marker) != rule.mustContain {
 			return false, fmt.Sprintf(rule.failure, marker)
 		}
 	}
 	return true, fmt.Sprintf("status=%d", probe.Status)
 }
 // checklistHTTPProbeResult holds what a checklist probe observed: the HTTP
 // status code, the response body as a string, and the Location header value.
 type checklistHTTPProbeResult struct {
 	Status   int
 	Body     string
 	Location string
 }
 // httpChecklistProbeResult performs the checklist probe and packs status,
 // body, and Location header into a checklistHTTPProbeResult. On error the
 // zero result is returned together with the error.
 // Signature: (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error).
 // Why: checklist checks need response headers (for redirect verification) in
 // addition to status/body so startup can validate real user-facing behavior.
 func (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error) {
 	code, payload, redirect, err := o.httpChecklistProbeWithLocation(ctx, check)
 	if err != nil {
 		return checklistHTTPProbeResult{}, err
 	}
 	return checklistHTTPProbeResult{Status: code, Body: payload, Location: redirect}, nil
 }
 // httpChecklistProbe performs the checklist probe and returns only the status
 // code and body, discarding the Location header. The values are passed through
 // unchanged from httpChecklistProbeWithLocation, including on error.
 // Signature: (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error).
 // Why: a narrower entry point for callers that do not inspect redirects.
 func (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error) {
 	code, payload, _, probeErr := o.httpChecklistProbeWithLocation(ctx, check)
 	return code, payload, probeErr
 }
 // httpChecklistProbeWithLocation issues a single GET against check.URL and
 // returns the status code, up to 64 KiB of the response body, and the trimmed
 // Location header. Redirects are not followed, so a 3xx response is reported
 // as-is. The request timeout comes from check.TimeoutSeconds (12 seconds when
 // not positive) and TLS verification is skipped only when
 // check.InsecureSkipTLS is set. If reading the body fails, the status code is
 // still returned alongside the error.
 // Signature: (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error).
 // Why: redirects and auth gates require location-header assertions to prevent
 // startup false-positives on partially healthy protected services.
 func (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error) {
 	timeout := time.Duration(check.TimeoutSeconds) * time.Second
 	if timeout <= 0 {
 		timeout = 12 * time.Second
 	}
 	transport := &http.Transport{}
 	// A fresh transport is built per probe and this function runs on every poll.
 	// Without this, each call could leave a keep-alive connection (and its
 	// goroutines) parked in a transport nobody references again; a bare
 	// Transport has no idle timeout to reap them.
 	defer transport.CloseIdleConnections()
 	if check.InsecureSkipTLS {
 		transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
 	}
 	client := &http.Client{
 		Timeout:   timeout,
 		Transport: transport,
 		// Surface the redirect response itself so callers can assert on Location.
 		CheckRedirect: func(_ *http.Request, _ []*http.Request) error {
 			return http.ErrUseLastResponse
 		},
 	}
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(check.URL), nil)
 	if err != nil {
 		return 0, "", "", fmt.Errorf("build request: %w", err)
 	}
 	req.Header.Set("User-Agent", "ananke/startup-checklist")
 	resp, err := client.Do(req)
 	if err != nil {
 		return 0, "", "", fmt.Errorf("request failed: %w", err)
 	}
 	defer resp.Body.Close()
 	// Cap the body so a misbehaving endpoint cannot exhaust memory.
 	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
 	if readErr != nil {
 		return resp.StatusCode, "", "", fmt.Errorf("read response body: %w", readErr)
 	}
 	return resp.StatusCode, string(body), strings.TrimSpace(resp.Header.Get("Location")), nil
 }
 // checklistContains reports whether marker occurs in body, ignoring case and,
 // as a fallback, ignoring all whitespace in both strings. A marker that is
 // empty once whitespace is removed always matches.
 // Signature: checklistContains(body, marker string) bool.
 // Why: expected markers should survive re-indented or pretty-printed responses.
 func checklistContains(body, marker string) bool {
 	haystack, needle := strings.ToLower(body), strings.ToLower(marker)
 	if strings.Contains(haystack, needle) {
 		return true
 	}
 	// Retry with whitespace stripped from both sides.
 	squeezedNeedle := compactLowerNoSpace(needle)
 	return squeezedNeedle == "" || strings.Contains(compactLowerNoSpace(haystack), squeezedNeedle)
 }
 // compactLowerNoSpace returns s with every Unicode whitespace rune removed.
 // Despite the name it does not change case; callers lowercase beforehand.
 // Signature: compactLowerNoSpace(s string) string.
 // Why: gives checklistContains a whitespace-insensitive form to compare.
 func compactLowerNoSpace(s string) string {
 	dropSpace := func(r rune) rune {
 		if unicode.IsSpace(r) {
 			// A negative result tells strings.Map to omit the rune.
 			return -1
 		}
 		return r
 	}
 	return strings.Map(dropSpace, s)
 }
 // waitForStabilityWindow re-runs the startup health checks on every poll for
 // the configured stability window. It returns nil at once when the window is
 // not positive, nil once the window has elapsed with every check healthy, an
 // error on the first unhealthy check, or ctx.Err() on cancellation. Each
 // iteration first invokes the stuck-pod recycler and the critical-workload
 // replica healer.
 // Signature: (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error.
 // Why: services that pass once and then flap should fail startup rather than
 // be reported healthy. The poll interval defaults to 5 seconds.
 func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
 	soak := time.Duration(o.cfg.Startup.ServiceChecklistStabilitySec) * time.Second
 	if soak <= 0 {
 		return nil
 	}
 	interval := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second
 	if interval <= 0 {
 		interval = 5 * time.Second
 	}
 	soakEnds := time.Now().Add(soak)
 	// Zero timestamps make the first progress log and heal attempts eligible immediately.
 	var reportedAt, recycleStamp, replicaHealStamp time.Time
 	for {
 		o.maybeAutoRecycleStuckPods(ctx, &recycleStamp)
 		o.maybeAutoHealCriticalWorkloadReplicas(ctx, &replicaHealStamp)
 		healthErr := o.startupStabilityHealthy(ctx)
 		if healthErr != nil {
 			return fmt.Errorf("startup stability window failed: %w", healthErr)
 		}
 		if time.Now().After(soakEnds) {
 			o.log.Printf("startup stability window passed (%s)", soak)
 			return nil
 		}
 		// Progress is logged at most every 30 seconds.
 		if time.Since(reportedAt) >= 30*time.Second {
 			left := time.Until(soakEnds).Round(time.Second)
 			if left < 0 {
 				left = 0
 			}
 			o.log.Printf("startup stability soak in progress (%s remaining)", left)
 			reportedAt = time.Now()
 		}
 		select {
 		case <-time.After(interval):
 		case <-ctx.Done():
 			return ctx.Err()
 		}
 	}
 }
 // startupStabilityHealthy runs the enabled startup health gates in a fixed
 // order — Flux health, workload convergence, service checklist, ingress
 // checklist — and then always checks for failing pods. It returns an error
 // describing the first gate that fails, or nil when all pass.
 // Signature: (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error.
 // Why: gives the stability window a single pass/fail answer; each gate except
 // the pod-failure check is opt-in through its Require* config flag.
 func (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error {
 	if o.cfg.Startup.RequireFluxHealth {
 		fluxOK, fluxDetail, fluxErr := o.fluxHealthReady(ctx)
 		switch {
 		case fluxErr != nil:
 			return fmt.Errorf("flux check error: %w", fluxErr)
 		case !fluxOK:
 			return fmt.Errorf("flux not ready: %s", fluxDetail)
 		}
 	}
 	if o.cfg.Startup.RequireWorkloadConvergence {
 		converged, convergeDetail, convergeErr := o.workloadConvergenceReady(ctx)
 		switch {
 		case convergeErr != nil:
 			return fmt.Errorf("workload check error: %w", convergeErr)
 		case !converged:
 			return fmt.Errorf("workloads not converged: %s", convergeDetail)
 		}
 	}
 	if o.cfg.Startup.RequireServiceChecklist {
 		if servicesOK, detail := o.serviceChecklistReady(ctx); !servicesOK {
 			return fmt.Errorf("external services not healthy: %s", detail)
 		}
 	}
 	if o.cfg.Startup.RequireIngressChecklist {
 		if ingressOK, detail := o.ingressChecklistReady(ctx); !ingressOK {
 			return fmt.Errorf("ingress reachability not healthy: %s", detail)
 		}
 	}
 	crashing, err := o.startupFailurePods(ctx)
 	if err != nil {
 		return fmt.Errorf("pod failure check error: %w", err)
 	}
 	if len(crashing) == 0 {
 		return nil
 	}
 	// Only the first few offenders are listed to keep the message short.
 	return fmt.Errorf("pods in crash/image-pull failures: %s", joinLimited(crashing, 8))
 }
--- a/internal/cluster/orchestrator_test.go
+++ b/internal/cluster/orchestrator_test.go
@ -15,6 +15,9 @@ import (
 	"scm.bstein.dev/bstein/ananke/internal/state"
 )
 // TestParseVaultSealed runs one orchestration or CLI step.
 // Signature: TestParseVaultSealed(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseVaultSealed(t *testing.T) {
 	sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`)
 	if err != nil {
@ -33,12 +36,18 @@ func TestParseVaultSealed(t *testing.T) {
 	}
 }
 // TestParseVaultSealedRejectsEmpty verifies that a whitespace-only status
 // payload makes parseVaultSealed return an error.
 // Signature: TestParseVaultSealedRejectsEmpty(t *testing.T).
 // Why: an empty payload must not be mistaken for a valid sealed/unsealed answer.
 func TestParseVaultSealedRejectsEmpty(t *testing.T) {
 	_, err := parseVaultSealed("   ")
 	if err != nil {
 		return
 	}
 	t.Fatalf("expected parse error for empty status payload")
 }
 // TestParseVaultSealedWithKubectlPreamble runs one orchestration or CLI step.
 // Signature: TestParseVaultSealedWithKubectlPreamble(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
 	raw := "Defaulted container \"vault\" out of: vault, setup-config (init)\n{\"sealed\":true,\"initialized\":true}\n"
 	sealed, err := parseVaultSealed(raw)
@ -50,6 +59,9 @@ func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
 	}
 }
 // TestFallbackWorkersFromInventoryUsesManagedNodes runs one orchestration or CLI step.
 // Signature: TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@ -70,6 +82,9 @@ func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
 	}
 }
 // TestFallbackWorkersFromInventoryFallsBackToHosts runs one orchestration or CLI step.
 // Signature: TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@ -89,12 +104,18 @@ func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
 	}
 }
 // TestIntentFreshTreatsZeroTimestampAsFresh runs one orchestration or CLI step.
 // Signature: TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T) {
 	if !intentFresh(state.Intent{}, 30*time.Second) {
 		t.Fatalf("zero updated_at intent should be treated as fresh")
 	}
 }
 // TestIntentFreshRespectsAge runs one orchestration or CLI step.
 // Signature: TestIntentFreshRespectsAge(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestIntentFreshRespectsAge(t *testing.T) {
 	stale := state.Intent{UpdatedAt: time.Now().Add(-2 * time.Minute)}
 	fresh := state.Intent{UpdatedAt: time.Now().Add(-20 * time.Second)}
@ -106,6 +127,9 @@ func TestIntentFreshRespectsAge(t *testing.T) {
 	}
 }
 // TestCoordinationPeersDedupesAndIncludesForwardHost runs one orchestration or CLI step.
 // Signature: TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@ -122,6 +146,9 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
 	}
 }
 // TestWorkloadTargetsIgnoredNodesByNodeSelector runs one orchestration or CLI step.
 // Signature: TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
 	spec := podSpec{
 		NodeSelector: map[string]string{
@ -134,6 +161,9 @@ func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
 	}
 }
 // TestParseWorkloadIgnoreRules runs one orchestration or CLI step.
 // Signature: TestParseWorkloadIgnoreRules(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseWorkloadIgnoreRules(t *testing.T) {
 	rules := parseWorkloadIgnoreRules([]string{
 		"maintenance/metis",
@ -153,6 +183,9 @@ func TestParseWorkloadIgnoreRules(t *testing.T) {
 	}
 }
 // TestNamespaceCandidatesFromIgnoreKustomizations runs one orchestration or CLI step.
 // Signature: TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
 	got := namespaceCandidatesFromIgnoreKustomizations([]string{
 		"flux-system/jellyfin",
@ -166,12 +199,18 @@ func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
 	}
 }
 // TestProbeStatusAcceptedRejects404 verifies that probeStatusAccepted rejects
 // a 404 response for a login URL.
 // Signature: TestProbeStatusAcceptedRejects404(t *testing.T).
 // Why: a missing page must not count as a reachable service.
 func TestProbeStatusAcceptedRejects404(t *testing.T) {
 	const target = "https://metrics.bstein.dev/login"
 	accepted := probeStatusAccepted(target, 404)
 	if !accepted {
 		return
 	}
 	t.Fatalf("expected 404 probe status to be rejected")
 }
 // TestParseFluxKustomizationTimeout runs one orchestration or CLI step.
 // Signature: TestParseFluxKustomizationTimeout(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestParseFluxKustomizationTimeout(t *testing.T) {
 	if got := parseFluxKustomizationTimeout("30m"); got != 30*time.Minute {
 		t.Fatalf("expected 30m duration, got %s", got)
@ -187,6 +226,9 @@ func TestParseFluxKustomizationTimeout(t *testing.T) {
 	}
 }
 // TestServiceCheckReadyRequiresBodyContains runs one orchestration or CLI step.
 // Signature: TestServiceCheckReadyRequiresBodyContains(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
@ -209,6 +251,9 @@ func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
 	}
 }
 // TestServiceCheckReadyBodyContainsIgnoresWhitespace runs one orchestration or CLI step.
 // Signature: TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
@ -231,6 +276,62 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
 	}
 }
 // TestServiceCheckReadyRequiresLocationContains runs one orchestration or CLI step.
 // Signature: TestServiceCheckReadyRequiresLocationContains(t *testing.T).
 // Why: startup checks must validate redirect targets for OIDC-gated services.
 func TestServiceCheckReadyRequiresLocationContains(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=logs")
 		w.WriteHeader(http.StatusFound)
 	}))
 	defer srv.Close()
 	orch := &Orchestrator{
 		log: log.New(os.Stdout, "", 0),
 	}
 	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
 		Name:             "logging-oidc-redirect",
 		URL:              srv.URL,
 		AcceptedStatuses: []int{302},
 		LocationContains: "client_id=logs",
 		TimeoutSeconds:   5,
 	})
 	if !ok {
 		t.Fatalf("expected location-aware service check to pass, detail=%s", detail)
 	}
 }
 // TestServiceCheckReadyRejectsMissingLocationMarker runs one orchestration or CLI step.
 // Signature: TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T).
 // Why: prevents false positives when redirects point somewhere unexpected.
 func TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T) {
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=wrong")
 		w.WriteHeader(http.StatusFound)
 	}))
 	defer srv.Close()
 	orch := &Orchestrator{
 		log: log.New(os.Stdout, "", 0),
 	}
 	ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
 		Name:             "logging-oidc-redirect",
 		URL:              srv.URL,
 		AcceptedStatuses: []int{302},
 		LocationContains: "client_id=logs",
 		TimeoutSeconds:   5,
 	})
 	if ok {
 		t.Fatalf("expected location-aware service check to fail")
 	}
 	if !strings.Contains(detail, "location header missing expected marker") {
 		t.Fatalf("expected missing location marker detail, got %q", detail)
 	}
 }
 // TestChecklistFailureHostFromIngressDetail runs one orchestration or CLI step.
 // Signature: TestChecklistFailureHostFromIngressDetail(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestChecklistFailureHostFromIngressDetail(t *testing.T) {
 	orch := &Orchestrator{}
 	got := orch.checklistFailureHost("cloud.bstein.dev: unexpected status code=500")
@ -239,6 +340,9 @@ func TestChecklistFailureHostFromIngressDetail(t *testing.T) {
 	}
 }
 // TestChecklistFailureHostFromServiceCheckName runs one orchestration or CLI step.
 // Signature: TestChecklistFailureHostFromServiceCheckName(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestChecklistFailureHostFromServiceCheckName(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@ -258,6 +362,9 @@ func TestChecklistFailureHostFromServiceCheckName(t *testing.T) {
 	}
 }
 // TestChecklistFailureHostUnknown runs one orchestration or CLI step.
 // Signature: TestChecklistFailureHostUnknown(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestChecklistFailureHostUnknown(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
@ -279,6 +386,9 @@ func TestChecklistFailureHostUnknown(t *testing.T) {
 	}
 }
 // TestStuckVaultInitReasonDetectsHungInit runs one orchestration or CLI step.
 // Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
 	var pod podResource
 	pod.Status.Phase = "Pending"
@ -302,6 +412,9 @@ func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
 	}
 }
 // TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods runs one orchestration or CLI step.
 // Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
 	var pod podResource
 	pod.Status.Phase = "Pending"
@ -328,70 +441,3 @@ func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
 		t.Fatalf("expected no reason for non-vault pod, got %q", reason)
 	}
 }
 func TestValidateNodeInventoryPassesForStrictMappings(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
 			SSHUser: "atlas",
 			SSHPort: 2277,
 			SSHNodeHosts: map[string]string{
 				"titan-0a": "192.168.22.11",
 				"titan-0b": "192.168.22.12",
 				"titan-0c": "192.168.22.13",
 				"titan-22": "192.168.22.22",
 			},
 			SSHManagedNodes: []string{"titan-0a", "titan-0b", "titan-0c", "titan-22"},
 			ControlPlanes:   []string{"titan-0a", "titan-0b", "titan-0c"},
 			Workers:         []string{"titan-22"},
 		},
 		log: log.New(os.Stdout, "", 0),
 	}
 	if err := orch.validateNodeInventory(); err != nil {
 		t.Fatalf("expected inventory to pass, got error: %v", err)
 	}
 }
 func TestValidateNodeInventoryFailsWhenNodeMappingMissing(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
 			SSHUser:         "atlas",
 			SSHPort:         2277,
 			SSHNodeHosts:    map[string]string{"titan-0a": "192.168.22.11"},
 			SSHManagedNodes: []string{"titan-0a", "titan-0b"},
 			ControlPlanes:   []string{"titan-0a"},
 			Workers:         []string{"titan-0b"},
 		},
 		log: log.New(os.Stdout, "", 0),
 	}
 	err := orch.validateNodeInventory()
 	if err == nil {
 		t.Fatalf("expected inventory error for missing mapping")
 	}
 	if !strings.Contains(err.Error(), "missing ssh_node_hosts entry") {
 		t.Fatalf("expected missing-mapping detail, got: %v", err)
 	}
 }
 func TestValidateNodeInventoryFailsWhenWorkerNotManaged(t *testing.T) {
 	orch := &Orchestrator{
 		cfg: config.Config{
 			SSHUser: "atlas",
 			SSHPort: 2277,
 			SSHNodeHosts: map[string]string{
 				"titan-0a": "192.168.22.11",
 				"titan-22": "192.168.22.22",
 			},
 			SSHManagedNodes: []string{"titan-0a"},
 			ControlPlanes:   []string{"titan-0a"},
 			Workers:         []string{"titan-22"},
 		},
 		log: log.New(os.Stdout, "", 0),
 	}
 	err := orch.validateNodeInventory()
 	if err == nil {
 		t.Fatalf("expected inventory error for unmanaged worker")
 	}
 	if !strings.Contains(err.Error(), "missing from ssh_managed_nodes") {
 		t.Fatalf("expected unmanaged-worker detail, got: %v", err)
 	}
 }
--- a/internal/config/apply_defaults.go
+++ b/internal/config/apply_defaults.go
@ -0,0 +1,236 @@
 package config
 import "strings"
 // applyDefaults fills every unset field of c with its built-in fallback value.
 // Signature: (c *Config) applyDefaults().
 // Why: host configs may omit most keys; this keeps one place that decides what an
 // omitted key means. A field counts as unset when it is "" (strings), <= 0
 // (numbers) or nil (slices/maps), so a numeric field cannot be configured to zero
 // or a negative value here - it is replaced by the default instead.
 // Two list fields (ServiceChecklist, CriticalServiceEndpoints) are not
 // "default if empty": they are always merged with the built-in baseline.
 func (c *Config) applyDefaults() {
 	// --- top-level repo / flux expectations ---
 	if c.ExpectedFluxBranch == "" {
 		c.ExpectedFluxBranch = "main"
 	}
 	if c.IACRepoPath == "" {
 		c.IACRepoPath = "/opt/titan-iac"
 	}
 	if c.ExpectedFluxSource == "" {
 		c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
 	}
 	// --- startup: API wait, cooldown, battery, node inventory ---
 	if c.Startup.APIWaitSeconds <= 0 {
 		c.Startup.APIWaitSeconds = 1200
 	}
 	if c.Startup.APIPollSeconds <= 0 {
 		c.Startup.APIPollSeconds = 2
 	}
 	if c.Startup.ShutdownCooldownSeconds <= 0 {
 		c.Startup.ShutdownCooldownSeconds = 45
 	}
 	if c.Startup.MinimumBatteryPercent <= 0 {
 		c.Startup.MinimumBatteryPercent = 20
 	}
 	if c.Startup.NodeInventoryReachWaitSeconds <= 0 {
 		c.Startup.NodeInventoryReachWaitSeconds = 300
 	}
 	if c.Startup.NodeInventoryReachPollSeconds <= 0 {
 		c.Startup.NodeInventoryReachPollSeconds = 5
 	}
 	// Only a nil map gets the default; an explicitly empty map is left empty.
 	if c.Startup.RequiredNodeLabels == nil {
 		c.Startup.RequiredNodeLabels = map[string]map[string]string{
 			"titan-09": {
 				"ananke.bstein.dev/harbor-bootstrap": "true",
 			},
 		}
 	}
 	// --- startup: time sync ---
 	if c.Startup.TimeSyncWaitSeconds <= 0 {
 		c.Startup.TimeSyncWaitSeconds = 240
 	}
 	if c.Startup.TimeSyncPollSeconds <= 0 {
 		c.Startup.TimeSyncPollSeconds = 5
 	}
 	if c.Startup.TimeSyncMode == "" {
 		c.Startup.TimeSyncMode = "quorum"
 	}
 	if c.Startup.TimeSyncQuorum <= 0 {
 		c.Startup.TimeSyncQuorum = 2
 	}
 	// Must run after the quorum default above: cap the quorum at the number of
 	// control planes, but only when at least one control plane is configured.
 	if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 {
 		c.Startup.TimeSyncQuorum = len(c.ControlPlanes)
 	}
 	// The first listed control plane is the default etcd restore target.
 	if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
 		c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
 	}
 	// --- startup: storage readiness ---
 	if c.Startup.StorageReadyWaitSeconds <= 0 {
 		c.Startup.StorageReadyWaitSeconds = 420
 	}
 	if c.Startup.StorageReadyPollSeconds <= 0 {
 		c.Startup.StorageReadyPollSeconds = 5
 	}
 	if c.Startup.StorageMinReadyNodes <= 0 {
 		c.Startup.StorageMinReadyNodes = 2
 	}
 	// Entries use the "namespace/pvc-name" form.
 	if len(c.Startup.StorageCriticalPVCs) == 0 {
 		c.Startup.StorageCriticalPVCs = []string{
 			"vault/data-vault-0",
 			"postgres/postgres-data-postgres-0",
 			"gitea/gitea-data",
 			"sso/keycloak-data",
 		}
 	}
 	// --- startup: post-start probes ---
 	if c.Startup.PostStartProbeWaitSeconds <= 0 {
 		c.Startup.PostStartProbeWaitSeconds = 240
 	}
 	if c.Startup.PostStartProbePollSeconds <= 0 {
 		c.Startup.PostStartProbePollSeconds = 5
 	}
 	if len(c.Startup.PostStartProbes) == 0 {
 		c.Startup.PostStartProbes = []string{
 			"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
 			"https://scm.bstein.dev/api/healthz",
 			"https://metrics.bstein.dev/api/health",
 		}
 	}
 	// --- startup: service checklist ---
 	if c.Startup.ServiceChecklistWaitSeconds <= 0 {
 		c.Startup.ServiceChecklistWaitSeconds = 420
 	}
 	if c.Startup.ServiceChecklistPollSeconds <= 0 {
 		c.Startup.ServiceChecklistPollSeconds = 5
 	}
 	// Unlike the other numeric fields, zero is a valid value here: only negative
 	// values are corrected, and no positive default is applied.
 	if c.Startup.ServiceChecklistStabilitySec < 0 {
 		c.Startup.ServiceChecklistStabilitySec = 0
 	}
 	// Always merged: configured checks are kept and any built-in check whose name
 	// is not already present is appended.
 	c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
 	// Runs after the merge so that both configured and built-in checks get a timeout.
 	for i := range c.Startup.ServiceChecklist {
 		if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
 			c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
 		}
 	}
 	// --- startup: critical service endpoints ---
 	if c.Startup.CriticalServiceEndpointWaitSec <= 0 {
 		c.Startup.CriticalServiceEndpointWaitSec = 420
 	}
 	if c.Startup.CriticalServiceEndpointPollSec <= 0 {
 		c.Startup.CriticalServiceEndpointPollSec = 5
 	}
 	// Always merged with the built-in endpoint list (see mergeStringDefaults).
 	c.Startup.CriticalServiceEndpoints = mergeStringDefaults(c.Startup.CriticalServiceEndpoints, defaultCriticalServiceEndpoints())
 	// --- startup: ingress checklist ---
 	if c.Startup.IngressChecklistWaitSeconds <= 0 {
 		c.Startup.IngressChecklistWaitSeconds = 420
 	}
 	if c.Startup.IngressChecklistPollSeconds <= 0 {
 		c.Startup.IngressChecklistPollSeconds = 5
 	}
 	if len(c.Startup.IngressChecklistAccepted) == 0 {
 		c.Startup.IngressChecklistAccepted = []int{200, 301, 302, 307, 308, 401, 403, 404}
 	}
 	// nil list fields below are replaced by empty, non-nil slices.
 	if c.Startup.IngressChecklistIgnoreHosts == nil {
 		c.Startup.IngressChecklistIgnoreHosts = []string{}
 	}
 	// --- startup: node SSH auth, flux health, workload convergence ---
 	if c.Startup.NodeSSHAuthWaitSeconds <= 0 {
 		c.Startup.NodeSSHAuthWaitSeconds = 240
 	}
 	if c.Startup.NodeSSHAuthPollSeconds <= 0 {
 		c.Startup.NodeSSHAuthPollSeconds = 5
 	}
 	if c.Startup.FluxHealthWaitSeconds <= 0 {
 		c.Startup.FluxHealthWaitSeconds = 900
 	}
 	if c.Startup.FluxHealthPollSeconds <= 0 {
 		c.Startup.FluxHealthPollSeconds = 5
 	}
 	if c.Startup.IgnoreFluxKustomizations == nil {
 		c.Startup.IgnoreFluxKustomizations = []string{}
 	}
 	if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
 		c.Startup.WorkloadConvergenceWaitSeconds = 900
 	}
 	if c.Startup.WorkloadConvergencePollSeconds <= 0 {
 		c.Startup.WorkloadConvergencePollSeconds = 5
 	}
 	if c.Startup.IgnoreWorkloadNamespaces == nil {
 		c.Startup.IgnoreWorkloadNamespaces = []string{}
 	}
 	if c.Startup.IgnoreWorkloads == nil {
 		c.Startup.IgnoreWorkloads = []string{}
 	}
 	if c.Startup.IgnoreUnavailableNodes == nil {
 		c.Startup.IgnoreUnavailableNodes = []string{}
 	}
 	if c.Startup.StuckPodGraceSeconds <= 0 {
 		c.Startup.StuckPodGraceSeconds = 180
 	}
 	// A whitespace-only path is treated the same as an empty one.
 	if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
 		c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
 	}
 	if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
 		c.Startup.VaultUnsealBreakglassTimeout = 15
 	}
 	// --- ssh ---
 	if c.SSHPort <= 0 {
 		c.SSHPort = 2277
 	}
 	// --- shutdown budgets and parallelism ---
 	if c.Shutdown.DefaultBudgetSeconds <= 0 {
 		c.Shutdown.DefaultBudgetSeconds = 1380
 	}
 	if c.Shutdown.HistoryMinSamples <= 0 {
 		c.Shutdown.HistoryMinSamples = 3
 	}
 	if c.Shutdown.EmergencyBudgetSec <= 0 {
 		c.Shutdown.EmergencyBudgetSec = 420
 	}
 	if c.Shutdown.EmergencyMinSamples <= 0 {
 		c.Shutdown.EmergencyMinSamples = 3
 	}
 	if c.Shutdown.DrainParallelism <= 0 {
 		c.Shutdown.DrainParallelism = 6
 	}
 	if c.Shutdown.ScaleParallelism <= 0 {
 		c.Shutdown.ScaleParallelism = 8
 	}
 	if c.Shutdown.SSHParallelism <= 0 {
 		c.Shutdown.SSHParallelism = 8
 	}
 	// --- UPS ---
 	if c.UPS.PollSeconds <= 0 {
 		c.UPS.PollSeconds = 5
 	}
 	if c.UPS.RuntimeSafetyFactor <= 0 {
 		c.UPS.RuntimeSafetyFactor = 1.25
 	}
 	if c.UPS.DebounceCount <= 0 {
 		c.UPS.DebounceCount = 3
 	}
 	if c.UPS.TelemetryTimeoutSeconds <= 0 {
 		c.UPS.TelemetryTimeoutSeconds = 90
 	}
 	// --- coordination ---
 	if c.Coordination.ForwardShutdownConfig == "" {
 		c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
 	}
 	if c.Coordination.PeerHosts == nil {
 		c.Coordination.PeerHosts = []string{}
 	}
 	if c.Coordination.CommandTimeoutSeconds <= 0 {
 		c.Coordination.CommandTimeoutSeconds = 25
 	}
 	if c.Coordination.StartupGuardMaxAgeSec <= 0 {
 		c.Coordination.StartupGuardMaxAgeSec = 900
 	}
 	if c.Coordination.Role == "" {
 		c.Coordination.Role = "coordinator"
 	}
 	// --- metrics ---
 	if c.Metrics.BindAddr == "" {
 		c.Metrics.BindAddr = "0.0.0.0:9560"
 	}
 	if c.Metrics.Path == "" {
 		c.Metrics.Path = "/metrics"
 	}
 	// --- state paths ---
 	// Each path defaults independently to a fixed location under /var/lib/ananke;
 	// they are not derived from a customised State.Dir.
 	if c.State.Dir == "" {
 		c.State.Dir = "/var/lib/ananke"
 	}
 	if c.State.ReportsDir == "" {
 		c.State.ReportsDir = "/var/lib/ananke/reports"
 	}
 	if c.State.RunHistoryPath == "" {
 		c.State.RunHistoryPath = "/var/lib/ananke/runs.json"
 	}
 	if c.State.LockPath == "" {
 		c.State.LockPath = "/var/lib/ananke/ananke.lock"
 	}
 	if c.State.IntentPath == "" {
 		c.State.IntentPath = "/var/lib/ananke/intent.json"
 	}
 }
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@ -7,6 +7,9 @@ import (
 	"testing"
 )
 // TestLoadAcceptsUPSTargets runs one orchestration or CLI step.
 // Signature: TestLoadAcceptsUPSTargets(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestLoadAcceptsUPSTargets(t *testing.T) {
 	tmp := t.TempDir()
 	cfgPath := filepath.Join(tmp, "ananke.yaml")
@ -39,6 +42,9 @@ state:
 	}
 }
 // TestValidateForwardShutdownRequiresConfigPath runs one orchestration or CLI step.
 // Signature: TestValidateForwardShutdownRequiresConfigPath(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
 	cfg := defaults()
 	cfg.Coordination.ForwardShutdownHost = "titan-db"
@ -48,6 +54,9 @@ func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
 	}
 }
 // TestValidateRejectsUnknownRole runs one orchestration or CLI step.
 // Signature: TestValidateRejectsUnknownRole(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsUnknownRole(t *testing.T) {
 	cfg := defaults()
 	cfg.Coordination.Role = "unknown"
@ -56,6 +65,9 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
 	}
 }
 // TestValidateRejectsEmptyPeerHostEntry runs one orchestration or CLI step.
 // Signature: TestValidateRejectsEmptyPeerHostEntry(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
 	cfg := defaults()
 	cfg.Coordination.PeerHosts = []string{"titan-24", " "}
@ -64,6 +76,9 @@ func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
 	}
 }
 // TestValidateRejectsUnknownEtcdRestoreControlPlane runs one orchestration or CLI step.
 // Signature: TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
@ -72,6 +87,9 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
 	}
 }
 // TestLoadSetsCoordinationGuardDefaults runs one orchestration or CLI step.
 // Signature: TestLoadSetsCoordinationGuardDefaults(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
 	tmp := t.TempDir()
 	cfgPath := filepath.Join(tmp, "ananke.yaml")
@ -114,6 +132,9 @@ state:
 	}
 }
 // TestValidateRejectsInvalidStartupShutdownCooldown runs one orchestration or CLI step.
 // Signature: TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.ShutdownCooldownSeconds = 0
@ -122,6 +143,9 @@ func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
 	}
 }
 // TestValidateRejectsInvalidTimeSyncMode runs one orchestration or CLI step.
 // Signature: TestValidateRejectsInvalidTimeSyncMode(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.TimeSyncMode = "invalid"
@ -130,6 +154,9 @@ func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
 	}
 }
 // TestValidateRejectsBadStoragePVCFormat runs one orchestration or CLI step.
 // Signature: TestValidateRejectsBadStoragePVCFormat(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.StorageCriticalPVCs = []string{"vault-data-vault-0"}
@ -138,6 +165,9 @@ func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
 	}
 }
 // TestValidateRejectsMissingPostStartProbesWhenRequired runs one orchestration or CLI step.
 // Signature: TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.RequirePostStartProbes = true
@ -147,6 +177,9 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
 	}
 }
 // TestValidateRejectsMissingServiceChecklistWhenRequired runs one orchestration or CLI step.
 // Signature: TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.RequireServiceChecklist = true
@ -156,6 +189,9 @@ func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
 	}
 }
 // TestValidateRejectsBadServiceChecklistURL runs one orchestration or CLI step.
 // Signature: TestValidateRejectsBadServiceChecklistURL(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
@ -171,6 +207,9 @@ func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
 	}
 }
 // TestValidateRejectsBadIgnoreFluxKustomizationFormat runs one orchestration or CLI step.
 // Signature: TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"}
@ -179,6 +218,9 @@ func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
 	}
 }
 // TestValidateRejectsBadIgnoreWorkloadFormat runs one orchestration or CLI step.
 // Signature: TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.IgnoreWorkloads = []string{"maintenance/metis/extra/value"}
@ -187,6 +229,9 @@ func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
 	}
 }
 // TestValidateRejectsInvalidRequiredNodeLabel runs one orchestration or CLI step.
 // Signature: TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T).
 // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
 func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
@ -198,3 +243,85 @@ func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
 		t.Fatalf("expected validation error for invalid required_node_labels entry")
 	}
 }
 // TestValidateRejectsInvalidNodeInventoryReachWindow checks that Validate rejects a zero reach window.
 // Signature: TestValidateRejectsInvalidNodeInventoryReachWindow(t *testing.T).
 // Why: a baseline config whose NodeInventoryReachWaitSeconds is forced to 0 must
 // not pass validation.
 func TestValidateRejectsInvalidNodeInventoryReachWindow(t *testing.T) {
 	cfg := defaults()
 	cfg.Startup.NodeInventoryReachWaitSeconds = 0

 	err := cfg.Validate()
 	if err != nil {
 		return
 	}
 	t.Fatalf("expected validation error for invalid node_inventory_reachability_wait_seconds")
 }
 // TestValidateRejectsMissingReportsDir checks that Validate rejects an empty reports directory.
 // Signature: TestValidateRejectsMissingReportsDir(t *testing.T).
 // Why: a baseline config whose State.ReportsDir is cleared must not pass validation.
 func TestValidateRejectsMissingReportsDir(t *testing.T) {
 	cfg := defaults()
 	cfg.State.ReportsDir = ""

 	err := cfg.Validate()
 	if err != nil {
 		return
 	}
 	t.Fatalf("expected validation error for missing state.reports_dir")
 }
 // TestApplyDefaultsMergesServiceChecklistDefaults runs one orchestration or CLI step.
 // Signature: TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T).
 // Why: host configs may define a partial checklist; startup still needs the
 // baseline service validations learned from drills.
 func TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T) {
 	cfg := Config{
 		Startup: Startup{
 			ServiceChecklist: []ServiceChecklistCheck{
 				{
 					Name:           "custom-smoke",
 					URL:            "https://example.invalid/healthz",
 					TimeoutSeconds: 7,
 				},
 			},
 		},
 	}
 	cfg.applyDefaults()
 	names := map[string]struct{}{}
 	for _, check := range cfg.Startup.ServiceChecklist {
 		names[check.Name] = struct{}{}
 	}
 	if _, ok := names["custom-smoke"]; !ok {
 		t.Fatalf("expected custom checklist entry to be preserved")
 	}
 	if _, ok := names["logging-oidc-redirect"]; !ok {
 		t.Fatalf("expected default logging redirect check to be merged in")
 	}
 	if _, ok := names["vaultwarden-ui"]; !ok {
 		t.Fatalf("expected default vaultwarden check to be merged in")
 	}
 }
 // TestApplyDefaultsMergesCriticalServiceEndpointDefaults checks that applyDefaults merges the endpoint list.
 // Signature: TestApplyDefaultsMergesCriticalServiceEndpointDefaults(t *testing.T).
 // Why: startup endpoint gating must keep baseline backend checks even when host
 // configs only provide a subset.
 func TestApplyDefaultsMergesCriticalServiceEndpointDefaults(t *testing.T) {
 	var cfg Config
 	cfg.Startup.CriticalServiceEndpoints = []string{"customns/customsvc"}
 	cfg.applyDefaults()

 	// hasEndpoint reports whether the merged list contains this exact entry.
 	hasEndpoint := func(want string) bool {
 		for _, got := range cfg.Startup.CriticalServiceEndpoints {
 			if got == want {
 				return true
 			}
 		}
 		return false
 	}

 	if !hasEndpoint("customns/customsvc") {
 		t.Fatalf("expected custom critical endpoint to be preserved")
 	}
 	if !hasEndpoint("logging/opensearch-dashboards") {
 		t.Fatalf("expected logging/opensearch-dashboards critical endpoint default")
 	}
 	if !hasEndpoint("monitoring/victoria-metrics-single-server") {
 		t.Fatalf("expected monitoring/victoria-metrics-single-server critical endpoint default")
 	}
 }
--- a/internal/config/defaults.go
+++ b/internal/config/defaults.go
@ -0,0 +1,155 @@
 package config
 // defaults builds the baseline Config and passes it through applyDefaults.
 // Signature: defaults() Config.
 // Why: gives one fully populated starting point, including the boolean gates and
 // lists (control planes, bootstrap paths, excluded namespaces) that applyDefaults
 // does not set on its own.
 func defaults() Config {
 	// Startup gates, wait/poll windows and checklists.
 	startup := Startup{
 		APIWaitSeconds:                1200,
 		APIPollSeconds:                2,
 		ShutdownCooldownSeconds:       45,
 		RequireNodeInventoryReach:     true,
 		NodeInventoryReachWaitSeconds: 300,
 		NodeInventoryReachPollSeconds: 5,
 		RequireTimeSync:               true,
 		TimeSyncWaitSeconds:           240,
 		TimeSyncPollSeconds:           5,
 		TimeSyncMode:                  "quorum",
 		TimeSyncQuorum:                2,
 		ReconcileAccessOnBoot:         true,
 		AutoEtcdRestoreOnAPIFailure:   true,
 		EtcdRestoreControlPlane:       "titan-0a",
 		RequireStorageReady:           true,
 		StorageReadyWaitSeconds:       420,
 		StorageReadyPollSeconds:       5,
 		StorageMinReadyNodes:          2,
 		StorageCriticalPVCs: []string{
 			"vault/data-vault-0",
 			"postgres/postgres-data-postgres-0",
 			"gitea/gitea-data",
 			"sso/keycloak-data",
 		},
 		MinimumBatteryPercent: 20,
 		RequiredNodeLabels: map[string]map[string]string{
 			"titan-09": {
 				"ananke.bstein.dev/harbor-bootstrap": "true",
 			},
 		},
 		RequirePostStartProbes:    true,
 		PostStartProbeWaitSeconds: 240,
 		PostStartProbePollSeconds: 5,
 		PostStartProbes: []string{
 			"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
 			"https://scm.bstein.dev/api/healthz",
 			"https://metrics.bstein.dev/api/health",
 		},
 		RequireServiceChecklist:         true,
 		ServiceChecklistWaitSeconds:     420,
 		ServiceChecklistPollSeconds:     5,
 		ServiceChecklistStabilitySec:    120,
 		ServiceChecklist:                defaultServiceChecklist(),
 		RequireCriticalServiceEndpoints: true,
 		CriticalServiceEndpointWaitSec:  420,
 		CriticalServiceEndpointPollSec:  5,
 		CriticalServiceEndpoints:        defaultCriticalServiceEndpoints(),
 		RequireIngressChecklist:         true,
 		IngressChecklistWaitSeconds:     420,
 		IngressChecklistPollSeconds:     5,
 		IngressChecklistAccepted:        []int{200, 301, 302, 307, 308, 401, 403, 404},
 		IngressChecklistIgnoreHosts:     []string{},
 		RequireNodeSSHAuth:              true,
 		NodeSSHAuthWaitSeconds:          240,
 		NodeSSHAuthPollSeconds:          5,
 		RequireFluxHealth:               true,
 		FluxHealthWaitSeconds:           900,
 		FluxHealthPollSeconds:           5,
 		IgnoreFluxKustomizations:        []string{},
 		RequireWorkloadConvergence:      true,
 		WorkloadConvergenceWaitSeconds:  900,
 		WorkloadConvergencePollSeconds:  5,
 		IgnoreWorkloadNamespaces:        []string{},
 		IgnoreWorkloads:                 []string{},
 		IgnoreUnavailableNodes:          []string{},
 		AutoRecycleStuckPods:            true,
 		StuckPodGraceSeconds:            180,
 		VaultUnsealKeyFile:              "/var/lib/ananke/vault-unseal.key",
 		VaultUnsealBreakglassTimeout:    15,
 	}

 	// Shutdown budgets and fan-out limits.
 	shutdown := Shutdown{
 		DefaultBudgetSeconds: 1380,
 		HistoryMinSamples:    3,
 		EmergencyBudgetSec:   420,
 		EmergencyMinSamples:  3,
 		EmergencySkipEtcd:    true,
 		EmergencySkipDrain:   true,
 		DrainParallelism:     6,
 		ScaleParallelism:     8,
 		SSHParallelism:       8,
 	}

 	ups := UPS{
 		Enabled:                 true,
 		Provider:                "nut",
 		PollSeconds:             5,
 		RuntimeSafetyFactor:     1.25,
 		DebounceCount:           3,
 		TelemetryTimeoutSeconds: 90,
 	}

 	coordination := Coordination{
 		ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
 		PeerHosts:             []string{},
 		FallbackLocalShutdown: true,
 		CommandTimeoutSeconds: 25,
 		StartupGuardMaxAgeSec: 900,
 		Role:                  "coordinator",
 		AllowStartupOnBattery: false,
 	}

 	metrics := Metrics{
 		Enabled:  true,
 		BindAddr: "0.0.0.0:9560",
 		Path:     "/metrics",
 	}

 	statePaths := State{
 		Dir:            "/var/lib/ananke",
 		ReportsDir:     "/var/lib/ananke/reports",
 		RunHistoryPath: "/var/lib/ananke/runs.json",
 		LockPath:       "/var/lib/ananke/ananke.lock",
 		IntentPath:     "/var/lib/ananke/intent.json",
 	}

 	// Paths applied during local bootstrap, in the order listed.
 	bootstrapPaths := []string{
 		"infrastructure/core",
 		"clusters/atlas/flux-system",
 		"infrastructure/sources/helm",
 		"infrastructure/metallb",
 		"infrastructure/traefik",
 		"infrastructure/cert-manager",
 		"infrastructure/vault-csi",
 		"infrastructure/vault-injector",
 		"services/vault",
 		"infrastructure/postgres",
 		"services/gitea",
 		"services/keycloak",
 		"services/oauth2-proxy",
 	}

 	excludedNamespaces := []string{
 		"kube-system",
 		"kube-public",
 		"kube-node-lease",
 		"flux-system",
 		"traefik",
 		"metallb-system",
 		"cert-manager",
 		"longhorn-system",
 		"vault",
 		"postgres",
 		"maintenance",
 	}

 	c := Config{
 		IACRepoPath:         "/opt/titan-iac",
 		ExpectedFluxBranch:  "main",
 		ExpectedFluxSource:  "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
 		SSHPort:             2277,
 		ControlPlanes:       []string{"titan-0a", "titan-0b", "titan-0c"},
 		LocalBootstrapPaths: bootstrapPaths,
 		ExcludedNamespaces:  excludedNamespaces,
 		Startup:             startup,
 		Shutdown:            shutdown,
 		UPS:                 ups,
 		Coordination:        coordination,
 		Metrics:             metrics,
 		State:               statePaths,
 	}
 	// Run the same normalisation a loaded config would get, so anything not set
 	// above still receives its fallback.
 	c.applyDefaults()
 	return c
 }
--- a/internal/config/startup_service_catalog.go
+++ b/internal/config/startup_service_catalog.go
@ -0,0 +1,315 @@
 package config
 import "strings"
 // defaultServiceChecklist returns the built-in list of per-service HTTP checks.
 // Signature: defaultServiceChecklist() []ServiceChecklistCheck.
 // Why: startup must verify real external behavior per service (not only generic
 // ingress reachability) so false positives do not pass drills.
 // Each entry names one URL, the status codes that count as healthy, and optionally
 // a substring expected in the body (BodyContains) or in the redirect target
 // (LocationContains). A fresh slice is built on every call, so callers may modify
 // the result. Names must stay unique: mergeServiceChecklistDefaults matches on Name.
 func defaultServiceChecklist() []ServiceChecklistCheck {
 	return []ServiceChecklistCheck{
 		{
 			Name:             "gitea-api",
 			URL:              "https://scm.bstein.dev/api/healthz",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "pass",
 			TimeoutSeconds:   12,
 		},
 		{
 			// NOTE(review): Grafana documents /api/health as pretty-printed JSON
 			// ("database": "ok" with a space after the colon). Confirm the body
 			// matcher tolerates that whitespace, or that this deployment returns
 			// compact JSON; otherwise this substring will never match.
 			Name:             "grafana-api",
 			URL:              "https://metrics.bstein.dev/api/health",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "\"database\":\"ok\"",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "keycloak-oidc",
 			URL:              "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"",
 			TimeoutSeconds:   12,
 		},
 		{
 			// An unauthenticated request is expected to be refused, so 401 is the healthy status here.
 			Name:             "harbor-registry-api",
 			URL:              "https://registry.bstein.dev/v2/",
 			AcceptedStatuses: []int{401},
 			BodyContains:     "unauthorized",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "alerts-ui",
 			URL:              "https://alerts.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "Alertmanager",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "auth-gateway-redirect",
 			URL:              "https://auth.bstein.dev/",
 			AcceptedStatuses: []int{302},
 			LocationContains: "https://sso.bstein.dev/realms/atlas/",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "home-site",
 			URL:              "https://bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "Titan Lab",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "actual-budget-ui",
 			URL:              "https://budget.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "<title>Actual",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "element-call-ui",
 			URL:              "https://call.live.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "Element Call",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "flux-gitops-ui",
 			URL:              "https://cd.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "Weave GitOps",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "chat-ai-health",
 			URL:              "https://chat.ai.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "\"ok\": true",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "jenkins-auth-gate",
 			URL:              "https://ci.bstein.dev/",
 			AcceptedStatuses: []int{403},
 			BodyContains:     "commenceLogin",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "nextcloud-login-redirect",
 			URL:              "https://cloud.bstein.dev/",
 			AcceptedStatuses: []int{302},
 			LocationContains: "/index.php/login",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "wger-redirect",
 			URL:              "https://health.bstein.dev/",
 			AcceptedStatuses: []int{302},
 			LocationContains: "/en/",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "livekit-edge",
 			URL:              "https://kit.live.bstein.dev/",
 			AcceptedStatuses: []int{404},
 			BodyContains:     "404 page not found",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "element-web-ui",
 			URL:              "https://live.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "<title>Element</title>",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "logging-oidc-redirect",
 			URL:              "https://logs.bstein.dev/",
 			AcceptedStatuses: []int{302},
 			LocationContains: "client_id=logs",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "longhorn-oidc-redirect",
 			URL:              "https://longhorn.bstein.dev/",
 			AcceptedStatuses: []int{302},
 			LocationContains: "https://sso.bstein.dev/realms/atlas/",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "matrix-auth-ui",
 			URL:              "https://matrix.live.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "matrix-authentication-service",
 			TimeoutSeconds:   12,
 		},
 		{
 			// Status-only check: no body or Location substring is required.
 			Name:             "monero-edge",
 			URL:              "https://monero.bstein.dev/",
 			AcceptedStatuses: []int{404},
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "firefly-login-redirect",
 			URL:              "https://money.bstein.dev/",
 			AcceptedStatuses: []int{302},
 			LocationContains: "/login",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "outline-ui",
 			URL:              "https://notes.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "<title>Outline</title>",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "collabora-probe",
 			URL:              "https://office.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "OK",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "pegasus-ui",
 			URL:              "https://pegasus.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "<title>Pegasus</title>",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "harbor-ui",
 			URL:              "https://registry.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "<title>Harbor</title>",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "vault-ui-redirect",
 			URL:              "https://secret.bstein.dev/",
 			AcceptedStatuses: []int{307},
 			LocationContains: "/ui/",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "sentinel-oidc-redirect",
 			URL:              "https://sentinel.bstein.dev/",
 			AcceptedStatuses: []int{302},
 			LocationContains: "client_id=metis",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "keycloak-admin-redirect",
 			URL:              "https://sso.bstein.dev/",
 			AcceptedStatuses: []int{302},
 			LocationContains: "https://sso.bstein.dev/admin/",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "jellyfin-edge",
 			URL:              "https://stream.bstein.dev/",
 			AcceptedStatuses: []int{302},
 			LocationContains: "web/",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "planka-ui",
 			URL:              "https://tasks.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "<title>PLANKA</title>",
 			TimeoutSeconds:   12,
 		},
 		{
 			Name:             "vaultwarden-ui",
 			URL:              "https://vault.bstein.dev/",
 			AcceptedStatuses: []int{200},
 			BodyContains:     "<title>Vaultwarden Web</title>",
 			TimeoutSeconds:   12,
 		},
 	}
 }
 // defaultCriticalServiceEndpoints returns the built-in "namespace/service" endpoint list.
 // Signature: defaultCriticalServiceEndpoints() []string.
 // Why: service edge checks are insufficient for protected stacks; endpoint
 // presence verifies that backends are actually routable before startup success.
 func defaultCriticalServiceEndpoints() []string {
 	monitoring := []string{
 		"monitoring/victoria-metrics-single-server",
 		"monitoring/grafana",
 		"monitoring/kube-state-metrics",
 	}
 	logging := []string{
 		"logging/oauth2-proxy-logs",
 		"logging/opensearch-dashboards",
 		"logging/opensearch-master",
 	}

 	// Monitoring entries first, then logging, matching the established order.
 	endpoints := make([]string, 0, len(monitoring)+len(logging))
 	endpoints = append(endpoints, monitoring...)
 	return append(endpoints, logging...)
 }
 // mergeServiceChecklistDefaults appends to existing every default check whose name is not already used.
 // Signature: mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck.
 // Why: host configs can keep custom checks while still inheriting mandatory
 // baseline checks introduced after incident learnings.
 // Names are compared after trimming surrounding whitespace. Existing entries are
 // copied through untouched and always come first. When existing is empty the
 // defaults are returned as-is (a copy); otherwise defaults with a blank name are
 // dropped. Neither input slice is modified.
 func mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck {
 	merged := make([]ServiceChecklistCheck, 0, len(existing)+len(defaults))
 	if len(existing) == 0 {
 		return append(merged, defaults...)
 	}

 	// Record which names the host config already claims.
 	taken := map[string]struct{}{}
 	for i := range existing {
 		if claimed := strings.TrimSpace(existing[i].Name); claimed != "" {
 			taken[claimed] = struct{}{}
 		}
 	}

 	merged = append(merged, existing...)
 	for i := range defaults {
 		candidate := strings.TrimSpace(defaults[i].Name)
 		if candidate == "" {
 			continue
 		}
 		if _, overridden := taken[candidate]; !overridden {
 			merged = append(merged, defaults[i])
 		}
 	}
 	return merged
 }
 // mergeStringDefaults returns existing followed by any defaults not already present.
 // Signature: mergeStringDefaults(existing, defaults []string) []string.
 // Why: keeps baseline startup guards applied while preserving site-specific
 // additions already declared in host configs.
 // Every entry is trimmed of surrounding whitespace; blank entries and repeats are
 // dropped, keeping the first occurrence. The same normalisation is applied whether
 // or not existing is empty, so the result never depends on which input a value
 // came from. The returned slice is always non-nil and never aliases an input.
 func mergeStringDefaults(existing, defaults []string) []string {
 	seen := make(map[string]struct{}, len(existing)+len(defaults))
 	out := make([]string, 0, len(existing)+len(defaults))
 	// Host entries go first so they keep their configured order; defaults fill in after.
 	for _, group := range [][]string{existing, defaults} {
 		for _, item := range group {
 			key := strings.TrimSpace(item)
 			if key == "" {
 				continue
 			}
 			if _, ok := seen[key]; ok {
 				continue
 			}
 			seen[key] = struct{}{}
 			out = append(out, key)
 		}
 	}
 	return out
 }
--- a/internal/config/types.go
+++ b/internal/config/types.go
@ -0,0 +1,156 @@
 package config
 // Config is the root configuration document; every field is mapped to a YAML
 // key through its struct tag.
 //
 // The flat fields carry cluster-wide settings: the kubeconfig value, SSH
 // access settings (the ssh_* keys, including per-node host and user maps and
 // jump-host settings), the IaC repository path, the expected Flux branch and
 // source URL, the control-plane and worker node lists, local bootstrap paths
 // and excluded namespaces. The remaining fields embed the per-area sections
 // Startup, Shutdown, UPS, Coordination, Metrics and State.
 //
 // NOTE(review): the descriptions above are read off the field names and YAML
 // keys only; defaults and validation are not defined in this type — confirm
 // against the loader before relying on them.
 type Config struct {
 	Kubeconfig          string            `yaml:"kubeconfig"`
 	SSHUser             string            `yaml:"ssh_user"`
 	SSHPort             int               `yaml:"ssh_port"`
 	SSHConfigFile       string            `yaml:"ssh_config_file"`
 	SSHIdentityFile     string            `yaml:"ssh_identity_file"`
 	SSHNodeHosts        map[string]string `yaml:"ssh_node_hosts"`
 	SSHNodeUsers        map[string]string `yaml:"ssh_node_users"`
 	SSHManagedNodes     []string          `yaml:"ssh_managed_nodes"`
 	SSHJumpHost         string            `yaml:"ssh_jump_host"`
 	SSHJumpUser         string            `yaml:"ssh_jump_user"`
 	IACRepoPath         string            `yaml:"iac_repo_path"`
 	ExpectedFluxBranch  string            `yaml:"expected_flux_branch"`
 	ExpectedFluxSource  string            `yaml:"expected_flux_source_url"`
 	ControlPlanes       []string          `yaml:"control_planes"`
 	Workers             []string          `yaml:"workers"`
 	LocalBootstrapPaths []string          `yaml:"local_bootstrap_paths"`
 	ExcludedNamespaces  []string          `yaml:"excluded_namespaces"`
 	Startup             Startup           `yaml:"startup"`
 	Shutdown            Shutdown          `yaml:"shutdown"`
 	UPS                 UPS               `yaml:"ups"`
 	Coordination        Coordination      `yaml:"coordination"`
 	Metrics             Metrics           `yaml:"metrics"`
 	State               State             `yaml:"state"`
 }
 // Startup holds the settings stored under the `startup` YAML key of Config.
 //
 // Most fields follow one repeating shape: a Require<X> bool next to a pair of
 // int fields named <X>WaitSeconds and <X>PollSeconds, plus optional lists or
 // options for that same <X>. The groups visible here are node inventory
 // reachability, time sync, storage readiness, post-start probes, the service
 // checklist, critical service endpoints, the ingress checklist, node SSH auth,
 // Flux health and workload convergence. Fields outside that shape cover API
 // wait/poll timing, a shutdown cooldown, a minimum battery percentage,
 // required node labels (node -> label -> value, judging by the map type),
 // access reconciliation on boot, automatic etcd restore, stuck-pod recycling
 // and Vault unseal inputs.
 //
 // Several Go field names are abbreviated while the YAML key keeps the long
 // form; the tag is authoritative for config files. Examples:
 // RequireNodeInventoryReach -> require_node_inventory_reachability,
 // ServiceChecklistStabilitySec -> service_checklist_stability_seconds,
 // CriticalServiceEndpointWaitSec -> critical_service_endpoint_wait_seconds,
 // IngressChecklistAccepted -> ingress_checklist_accepted_statuses,
 // VaultUnsealBreakglassTimeout -> vault_unseal_breakglass_timeout_seconds.
 //
 // NOTE(review): how each Require* flag gates its check, and what zero values
 // mean, is not visible in this type — confirm against the consumers.
 type Startup struct {
 	APIWaitSeconds                  int                          `yaml:"api_wait_seconds"`
 	APIPollSeconds                  int                          `yaml:"api_poll_seconds"`
 	ShutdownCooldownSeconds         int                          `yaml:"shutdown_cooldown_seconds"`
 	MinimumBatteryPercent           float64                      `yaml:"minimum_battery_percent"`
 	RequireNodeInventoryReach       bool                         `yaml:"require_node_inventory_reachability"`
 	NodeInventoryReachWaitSeconds   int                          `yaml:"node_inventory_reachability_wait_seconds"`
 	NodeInventoryReachPollSeconds   int                          `yaml:"node_inventory_reachability_poll_seconds"`
 	RequiredNodeLabels              map[string]map[string]string `yaml:"required_node_labels"`
 	RequireTimeSync                 bool                         `yaml:"require_time_sync"`
 	TimeSyncWaitSeconds             int                          `yaml:"time_sync_wait_seconds"`
 	TimeSyncPollSeconds             int                          `yaml:"time_sync_poll_seconds"`
 	TimeSyncMode                    string                       `yaml:"time_sync_mode"`
 	TimeSyncQuorum                  int                          `yaml:"time_sync_quorum"`
 	ReconcileAccessOnBoot           bool                         `yaml:"reconcile_access_on_boot"`
 	AutoEtcdRestoreOnAPIFailure     bool                         `yaml:"auto_etcd_restore_on_api_failure"`
 	EtcdRestoreControlPlane         string                       `yaml:"etcd_restore_control_plane"`
 	RequireStorageReady             bool                         `yaml:"require_storage_ready"`
 	StorageReadyWaitSeconds         int                          `yaml:"storage_ready_wait_seconds"`
 	StorageReadyPollSeconds         int                          `yaml:"storage_ready_poll_seconds"`
 	StorageMinReadyNodes            int                          `yaml:"storage_min_ready_nodes"`
 	StorageCriticalPVCs             []string                     `yaml:"storage_critical_pvcs"`
 	RequirePostStartProbes          bool                         `yaml:"require_post_start_probes"`
 	PostStartProbeWaitSeconds       int                          `yaml:"post_start_probe_wait_seconds"`
 	PostStartProbePollSeconds       int                          `yaml:"post_start_probe_poll_seconds"`
 	PostStartProbes                 []string                     `yaml:"post_start_probes"`
 	RequireServiceChecklist         bool                         `yaml:"require_service_checklist"`
 	ServiceChecklistWaitSeconds     int                          `yaml:"service_checklist_wait_seconds"`
 	ServiceChecklistPollSeconds     int                          `yaml:"service_checklist_poll_seconds"`
 	ServiceChecklistStabilitySec    int                          `yaml:"service_checklist_stability_seconds"`
 	ServiceChecklist                []ServiceChecklistCheck      `yaml:"service_checklist"`
 	RequireCriticalServiceEndpoints bool                         `yaml:"require_critical_service_endpoints"`
 	CriticalServiceEndpointWaitSec  int                          `yaml:"critical_service_endpoint_wait_seconds"`
 	CriticalServiceEndpointPollSec  int                          `yaml:"critical_service_endpoint_poll_seconds"`
 	CriticalServiceEndpoints        []string                     `yaml:"critical_service_endpoints"`
 	RequireIngressChecklist         bool                         `yaml:"require_ingress_checklist"`
 	IngressChecklistWaitSeconds     int                          `yaml:"ingress_checklist_wait_seconds"`
 	IngressChecklistPollSeconds     int                          `yaml:"ingress_checklist_poll_seconds"`
 	IngressChecklistAccepted        []int                        `yaml:"ingress_checklist_accepted_statuses"`
 	IngressChecklistIgnoreHosts     []string                     `yaml:"ingress_checklist_ignore_hosts"`
 	IngressChecklistInsecureSkip    bool                         `yaml:"ingress_checklist_insecure_skip_tls"`
 	RequireNodeSSHAuth              bool                         `yaml:"require_node_ssh_auth"`
 	NodeSSHAuthWaitSeconds          int                          `yaml:"node_ssh_auth_wait_seconds"`
 	NodeSSHAuthPollSeconds          int                          `yaml:"node_ssh_auth_poll_seconds"`
 	RequireFluxHealth               bool                         `yaml:"require_flux_health"`
 	FluxHealthWaitSeconds           int                          `yaml:"flux_health_wait_seconds"`
 	FluxHealthPollSeconds           int                          `yaml:"flux_health_poll_seconds"`
 	IgnoreFluxKustomizations        []string                     `yaml:"ignore_flux_kustomizations"`
 	RequireWorkloadConvergence      bool                         `yaml:"require_workload_convergence"`
 	WorkloadConvergenceWaitSeconds  int                          `yaml:"workload_convergence_wait_seconds"`
 	WorkloadConvergencePollSeconds  int                          `yaml:"workload_convergence_poll_seconds"`
 	IgnoreWorkloadNamespaces        []string                     `yaml:"ignore_workload_namespaces"`
 	IgnoreWorkloads                 []string                     `yaml:"ignore_workloads"`
 	IgnoreUnavailableNodes          []string                     `yaml:"ignore_unavailable_nodes"`
 	AutoRecycleStuckPods            bool                         `yaml:"auto_recycle_stuck_pods"`
 	StuckPodGraceSeconds            int                          `yaml:"stuck_pod_grace_seconds"`
 	VaultUnsealKeyFile              string                       `yaml:"vault_unseal_key_file"`
 	VaultUnsealBreakglassCommand    string                       `yaml:"vault_unseal_breakglass_command"`
 	VaultUnsealBreakglassTimeout    int                          `yaml:"vault_unseal_breakglass_timeout_seconds"`
 }
 // ServiceChecklistCheck is one entry of Startup.ServiceChecklist: a named
 // check against a URL together with the expectations placed on the response.
 //
 // Name identifies the check. URL is the target. AcceptedStatuses lists the
 // status codes treated as passing. LocationContains / LocationNotContains and
 // BodyContains / BodyNotContains are substring expectations — presumably on
 // the Location response header and the response body respectively (TODO
 // confirm against the code that executes the check). TimeoutSeconds and
 // InsecureSkipTLS are per-check request options.
 //
 // NOTE(review): what an empty AcceptedStatuses or a zero TimeoutSeconds means
 // is not defined by this type — verify in the checker.
 type ServiceChecklistCheck struct {
 	Name                string `yaml:"name"`
 	URL                 string `yaml:"url"`
 	AcceptedStatuses    []int  `yaml:"accepted_statuses"`
 	LocationContains    string `yaml:"location_contains"`
 	LocationNotContains string `yaml:"location_not_contains"`
 	BodyContains        string `yaml:"body_contains"`
 	BodyNotContains     string `yaml:"body_not_contains"`
 	TimeoutSeconds      int    `yaml:"timeout_seconds"`
 	InsecureSkipTLS     bool   `yaml:"insecure_skip_tls"`
 }
 // Shutdown holds the settings stored under the `shutdown` YAML key of Config.
 //
 // The fields fall into three groups: time budgets and history sample minimums
 // (DefaultBudgetSeconds, HistoryMinSamples) with Emergency* counterparts;
 // switches that skip the etcd snapshot or the drain step, again with separate
 // Emergency* variants; and parallelism limits for drain, scale and SSH work.
 //
 // As elsewhere in this file, some Go names are shortened relative to the YAML
 // key (EmergencyBudgetSec -> emergency_budget_seconds, EmergencyMinSamples ->
 // emergency_history_min_samples, EmergencySkipEtcd ->
 // emergency_skip_etcd_snapshot).
 //
 // NOTE(review): when the Emergency* values apply instead of the regular ones
 // is decided by code outside this type — confirm there.
 type Shutdown struct {
 	DefaultBudgetSeconds int  `yaml:"default_budget_seconds"`
 	HistoryMinSamples    int  `yaml:"history_min_samples"`
 	EmergencyBudgetSec   int  `yaml:"emergency_budget_seconds"`
 	EmergencyMinSamples  int  `yaml:"emergency_history_min_samples"`
 	EmergencySkipEtcd    bool `yaml:"emergency_skip_etcd_snapshot"`
 	EmergencySkipDrain   bool `yaml:"emergency_skip_drain"`
 	SkipEtcdSnapshot     bool `yaml:"skip_etcd_snapshot"`
 	SkipDrain            bool `yaml:"skip_drain"`
 	DrainParallelism     int  `yaml:"drain_parallelism"`
 	ScaleParallelism     int  `yaml:"scale_parallelism"`
 	SSHParallelism       int  `yaml:"ssh_parallelism"`
 }
 // UPS holds the settings stored under the `ups` YAML key of Config.
 //
 // Enabled and Provider select whether and how UPS data is consulted. Both a
 // single Target string and a Targets list of named UPSTarget entries are
 // declared; how the two combine is not visible here (NOTE(review): confirm
 // precedence in the UPS code). The remaining fields tune polling
 // (PollSeconds), a runtime safety factor, a debounce count and a telemetry
 // timeout in seconds.
 type UPS struct {
 	Enabled                 bool        `yaml:"enabled"`
 	Provider                string      `yaml:"provider"`
 	Target                  string      `yaml:"target"`
 	Targets                 []UPSTarget `yaml:"targets"`
 	PollSeconds             int         `yaml:"poll_seconds"`
 	RuntimeSafetyFactor     float64     `yaml:"runtime_safety_factor"`
 	DebounceCount           int         `yaml:"debounce_count"`
 	TelemetryTimeoutSeconds int         `yaml:"telemetry_timeout_seconds"`
 }
 // UPSTarget is one entry of UPS.Targets: a Name paired with a Target string.
 // NOTE(review): the Target format depends on the configured provider and is
 // not defined by this type.
 type UPSTarget struct {
 	Name   string `yaml:"name"`
 	Target string `yaml:"target"`
 }
 // Coordination holds the settings stored under the `coordination` YAML key of
 // Config.
 //
 // The ForwardShutdown* fields name a host, user and config used when a
 // shutdown is forwarded elsewhere; PeerHosts lists peer hosts;
 // FallbackLocalShutdown, CommandTimeoutSeconds, StartupGuardMaxAgeSec (YAML
 // key startup_guard_max_age_seconds), Role and AllowStartupOnBattery tune how
 // this host takes part in coordination.
 //
 // NOTE(review): the accepted Role values and the exact forwarding behaviour
 // are not visible in this type — confirm against the coordination code.
 type Coordination struct {
 	ForwardShutdownHost   string   `yaml:"forward_shutdown_host"`
 	ForwardShutdownUser   string   `yaml:"forward_shutdown_user"`
 	ForwardShutdownConfig string   `yaml:"forward_shutdown_config"`
 	PeerHosts             []string `yaml:"peer_hosts"`
 	FallbackLocalShutdown bool     `yaml:"fallback_local_shutdown"`
 	CommandTimeoutSeconds int      `yaml:"command_timeout_seconds"`
 	StartupGuardMaxAgeSec int      `yaml:"startup_guard_max_age_seconds"`
 	Role                  string   `yaml:"role"`
 	AllowStartupOnBattery bool     `yaml:"allow_startup_on_battery"`
 }
 // Metrics holds the settings stored under the `metrics` YAML key of Config:
 // an on/off switch, a bind address and a path.
 // NOTE(review): presumably the listen address and URL path of a metrics HTTP
 // endpoint — confirm against the server setup code.
 type Metrics struct {
 	Enabled  bool   `yaml:"enabled"`
 	BindAddr string `yaml:"bind_addr"`
 	Path     string `yaml:"path"`
 }
 // State holds the settings stored under the `state` YAML key of Config: a
 // base directory, a reports directory, and paths for the run history, lock
 // and intent files.
 // NOTE(review): whether the individual paths are resolved relative to Dir is
 // not visible in this type — confirm against the state store.
 type State struct {
 	Dir            string `yaml:"dir"`
 	ReportsDir     string `yaml:"reports_dir"`
 	RunHistoryPath string `yaml:"run_history_path"`
 	LockPath       string `yaml:"lock_path"`
 	IntentPath     string `yaml:"intent_path"`
 }