From 95fefba244ab4caa9f7fa53c038cfa655e7aa358 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 8 Apr 2026 23:42:09 -0300 Subject: [PATCH] startup: enforce external service behavior checks --- internal/cluster/orchestrator_core.go | 124 ++++++ .../cluster/orchestrator_service_stability.go | 389 ++++++++++++++++++ internal/cluster/orchestrator_test.go | 180 +++++--- internal/config/apply_defaults.go | 236 +++++++++++ internal/config/config_test.go | 127 ++++++ internal/config/defaults.go | 155 +++++++ internal/config/startup_service_catalog.go | 315 ++++++++++++++ internal/config/types.go | 156 +++++++ 8 files changed, 1615 insertions(+), 67 deletions(-) create mode 100644 internal/cluster/orchestrator_core.go create mode 100644 internal/cluster/orchestrator_service_stability.go create mode 100644 internal/config/apply_defaults.go create mode 100644 internal/config/defaults.go create mode 100644 internal/config/startup_service_catalog.go create mode 100644 internal/config/types.go diff --git a/internal/cluster/orchestrator_core.go b/internal/cluster/orchestrator_core.go new file mode 100644 index 0000000..667397f --- /dev/null +++ b/internal/cluster/orchestrator_core.go @@ -0,0 +1,124 @@ +package cluster + +import ( + "context" + "errors" + "log" + "regexp" + "sync" + "time" + + "scm.bstein.dev/bstein/ananke/internal/config" + "scm.bstein.dev/bstein/ananke/internal/execx" + "scm.bstein.dev/bstein/ananke/internal/state" +) + +type Orchestrator struct { + cfg config.Config + runner *execx.Runner + store *state.Store + log *log.Logger + runOverride func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error) + runSensitiveOverride func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error) + startupReportMu sync.Mutex + activeStartupReport *startupReport +} + +type commandOverrideFunc func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error) + 
// StartupOptions groups the caller-supplied settings for one startup run.
// NOTE(review): field meanings are inferred from their names; the code that
// consumes them is not part of this file.
type StartupOptions struct {
	ForceFluxBranch    string
	SkipLocalBootstrap bool
	Reason             string
}

// ShutdownOptions groups the caller-supplied settings for one shutdown run.
// NOTE(review): field meanings are inferred from their names; confirm against
// the shutdown workflow that reads them.
type ShutdownOptions struct {
	SkipEtcdSnapshot bool
	SkipDrain        bool
	Mode             string
	Reason           string
}

// EtcdRestoreOptions names the control-plane node and snapshot path for an
// etcd restore request.
type EtcdRestoreOptions struct {
	ControlPlane string
	SnapshotPath string
}

// startupWorkload identifies a single workload by namespace, kind, and name.
type startupWorkload struct {
	Namespace string
	Kind      string
	Name      string
}

// workloadScaleEntry is the JSON form of one workload together with a replica count.
type workloadScaleEntry struct {
	Namespace string `json:"namespace"`
	Kind      string `json:"kind"`
	Name      string `json:"name"`
	Replicas  int    `json:"replicas"`
}

// remotePeerStatus pairs a peer's state.Intent with a bootstrap-active flag.
type remotePeerStatus struct {
	Intent          state.Intent
	BootstrapActive bool
}

// workloadScaleSnapshot is a timestamped, JSON-serializable list of workloadScaleEntry values.
type workloadScaleSnapshot struct {
	GeneratedAt time.Time            `json:"generated_at"`
	Entries     []workloadScaleEntry `json:"entries"`
}

// startupReport is the JSON-serializable record of one startup run: timing,
// reason, status/phase, outcome, per-check records keyed by name, and the
// list of auto-heal actions.
type startupReport struct {
	StartedAt   time.Time                     `json:"started_at"`
	Completed   time.Time                     `json:"completed_at"`
	Reason      string                        `json:"reason"`
	Status      string                        `json:"status"`
	Phase       string                        `json:"phase"`
	Success     bool                          `json:"success"`
	Error       string                        `json:"error,omitempty"`
	Checks      map[string]startupCheckRecord `json:"checks"`
	AutoHeals   []string                      `json:"auto_heals"`
	SourceHost  string                        `json:"source_host"`
	LastUpdated time.Time                     `json:"last_updated"`
}

// startupCheckRecord is the JSON form of a single check's status, detail text,
// and last-update time inside a startupReport.
type startupCheckRecord struct {
	Status    string    `json:"status"`
	Detail    string    `json:"detail"`
	UpdatedAt time.Time `json:"updated_at"`
}

// datastoreEndpointPattern matches a --datastore-endpoint flag whose value
// follows either "=" or whitespace. The value is captured in group 1 when
// single-quoted, group 2 when double-quoted, and group 3 when bare (a run of
// characters that are neither whitespace nor backslash).
var datastoreEndpointPattern = regexp.MustCompile(`--datastore-endpoint(?:=|\s+)(?:'([^']+)'|"([^"]+)"|([^\s\\]+))`)

// criticalStartupWorkloads lists the deployments and statefulsets, by
// namespace/kind/name, that this package treats as critical during startup.
// NOTE(review): the consumers of this list are outside this file.
var criticalStartupWorkloads = []startupWorkload{
	{Namespace: "flux-system", Kind: "deployment", Name: "source-controller"},
	{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
	{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
	{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},
	{Namespace: "vault", Kind: "statefulset", Name: "vault"},
	{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
	{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
	{Namespace: "monitoring", Kind: "deployment", Name: "grafana"},
	{Namespace: "monitoring", Kind: "statefulset", Name: "victoria-metrics-single-server"},
	{Namespace: "monitoring", Kind: "deployment", Name: "kube-state-metrics"},
	{Namespace: "logging", Kind: "deployment", Name: "oauth2-proxy-logs"},
	{Namespace: "logging", Kind: "deployment", Name: "opensearch-dashboards"},
	{Namespace: "logging", Kind: "statefulset", Name: "opensearch"},
}

// ErrEtcdRestoreNotApplicable is a sentinel error; compare with errors.Is.
// NOTE(review): presumably returned when an etcd restore is requested on a
// cluster where it does not apply — confirm against the restore code path.
var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")

// New constructs an Orchestrator from the given config, command runner, state
// store, and logger. No other fields are initialized.
// Signature: New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
	return &Orchestrator{cfg: cfg, runner: runner, store: store, log: logger}
}

// SetCommandOverrides stores run and runSensitive as the orchestrator's
// command override hooks.
// Signature: (o *Orchestrator) SetCommandOverrides(run commandOverrideFunc, runSensitive commandOverrideFunc).
// Why: enables deterministic integration testing from the top-level testing module
// without requiring package-local test files or live cluster dependencies.
+func (o *Orchestrator) SetCommandOverrides(run commandOverrideFunc, runSensitive commandOverrideFunc) { + o.runOverride = run + o.runSensitiveOverride = runSensitive +} diff --git a/internal/cluster/orchestrator_service_stability.go b/internal/cluster/orchestrator_service_stability.go new file mode 100644 index 0000000..3f8a9c1 --- /dev/null +++ b/internal/cluster/orchestrator_service_stability.go @@ -0,0 +1,389 @@ +package cluster + +import ( + "context" + "crypto/tls" + "encoding/json" + "fmt" + "io" + "net/http" + "strings" + "time" + "unicode" + + "scm.bstein.dev/bstein/ananke/internal/config" +) + +// isLikelyHostname runs one orchestration or CLI step. +// Signature: isLikelyHostname(value string) bool. +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. +func isLikelyHostname(value string) bool { + value = strings.TrimSpace(value) + if value == "" { + return false + } + if strings.Contains(value, " ") || strings.Contains(value, "/") { + return false + } + return strings.Contains(value, ".") +} + +// healIngressHostBackendReplicas runs one orchestration or CLI step. +// Signature: (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
+func (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error) { + namespaces, err := o.discoverIngressNamespacesForHost(ctx, host) + if err != nil { + return nil, err + } + if len(namespaces) == 0 { + return nil, nil + } + targetNamespaces := makeStringSet(namespaces) + out, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json") + if err != nil { + return nil, fmt.Errorf("query workloads: %w", err) + } + var list workloadList + if err := json.Unmarshal([]byte(out), &list); err != nil { + return nil, fmt.Errorf("decode workloads: %w", err) + } + healed := []string{} + for _, item := range list.Items { + kind := strings.ToLower(strings.TrimSpace(item.Kind)) + ns := strings.TrimSpace(item.Metadata.Namespace) + name := strings.TrimSpace(item.Metadata.Name) + if kind == "" || ns == "" || name == "" { + continue + } + if kind != "deployment" && kind != "statefulset" { + continue + } + if _, ok := targetNamespaces[ns]; !ok { + continue + } + desired := int32(1) + if item.Spec.Replicas != nil { + desired = *item.Spec.Replicas + } + if desired >= 1 { + continue + } + workload := startupWorkload{Namespace: ns, Kind: kind, Name: name} + if err := o.ensureWorkloadReplicas(ctx, workload, 1); err != nil { + if isNotFoundErr(err) { + continue + } + return healed, fmt.Errorf("scale %s/%s/%s to 1: %w", ns, kind, name, err) + } + healed = append(healed, ns+"/"+kind+"/"+name) + } + return healed, nil +} + +// waitForServiceChecklist runs one orchestration or CLI step. +// Signature: (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error. +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
+func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error { + wait := time.Duration(o.cfg.Startup.ServiceChecklistWaitSeconds) * time.Second + if wait <= 0 { + wait = 7 * time.Minute + } + poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second + if poll <= 0 { + poll = 5 * time.Second + } + deadline := time.Now().Add(wait) + lastFailure := "unknown" + lastLogged := time.Time{} + lastRecycleAttempt := time.Time{} + lastReplicaHeal := time.Time{} + lastIngressHeal := time.Time{} + for { + o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) + o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal) + prevFailure := lastFailure + ready, detail := o.serviceChecklistReady(ctx) + lastFailure = detail + if ready { + o.log.Printf("external service checklist passed (%s)", detail) + return nil + } + o.maybeAutoHealIngressHostBackends(ctx, &lastIngressHeal, lastFailure) + if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second { + remaining := time.Until(deadline).Round(time.Second) + if remaining < 0 { + remaining = 0 + } + o.log.Printf("waiting for external service checklist (%s remaining): %s", remaining, lastFailure) + lastLogged = time.Now() + } + if time.Now().After(deadline) { + return fmt.Errorf("startup blocked: external service checklist not satisfied within %s (%s)", wait, lastFailure) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +// serviceChecklistReady runs one orchestration or CLI step. +// Signature: (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
+func (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string) { + checks := o.cfg.Startup.ServiceChecklist + if len(checks) == 0 { + return true, "no checklist items configured" + } + for _, check := range checks { + ok, detail := o.serviceCheckReady(ctx, check) + if !ok { + name := strings.TrimSpace(check.Name) + if name == "" { + name = strings.TrimSpace(check.URL) + } + return false, fmt.Sprintf("%s: %s", name, detail) + } + } + return true, fmt.Sprintf("checks=%d", len(checks)) +} + +// serviceCheckReady runs one orchestration or CLI step. +// Signature: (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. +func (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string) { + result, err := o.httpChecklistProbeResult(ctx, check) + if err != nil { + return false, err.Error() + } + + accepted := check.AcceptedStatuses + if len(accepted) == 0 { + accepted = []int{200, 201, 202, 203, 204, 301, 302, 303, 307, 308, 401, 403} + } + statusOk := false + for _, code := range accepted { + if result.Status == code { + statusOk = true + break + } + } + if !statusOk { + return false, fmt.Sprintf("unexpected status code=%d", result.Status) + } + + locationContains := strings.TrimSpace(check.LocationContains) + if locationContains != "" && !checklistContains(result.Location, locationContains) { + return false, fmt.Sprintf("location header missing expected marker %q", locationContains) + } + + locationNotContains := strings.TrimSpace(check.LocationNotContains) + if locationNotContains != "" && checklistContains(result.Location, locationNotContains) { + return false, fmt.Sprintf("location header contained forbidden marker %q", locationNotContains) + } + + bodyContains := strings.TrimSpace(check.BodyContains) + if bodyContains != "" && 
!checklistContains(result.Body, bodyContains) { + return false, fmt.Sprintf("response missing expected marker %q", bodyContains) + } + + bodyNotContains := strings.TrimSpace(check.BodyNotContains) + if bodyNotContains != "" && checklistContains(result.Body, bodyNotContains) { + return false, fmt.Sprintf("response contained forbidden marker %q", bodyNotContains) + } + + return true, fmt.Sprintf("status=%d", result.Status) +} + +type checklistHTTPProbeResult struct { + Status int + Body string + Location string +} + +// httpChecklistProbeResult runs one orchestration or CLI step. +// Signature: (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error). +// Why: checklist checks need response headers (for redirect verification) in +// addition to status/body so startup can validate real user-facing behavior. +func (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error) { + result := checklistHTTPProbeResult{} + status, body, location, err := o.httpChecklistProbeWithLocation(ctx, check) + if err != nil { + return result, err + } + result.Status = status + result.Body = body + result.Location = location + return result, nil +} + +// httpChecklistProbe runs one orchestration or CLI step. +// Signature: (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. +func (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error) { + status, body, _, err := o.httpChecklistProbeWithLocation(ctx, check) + return status, body, err +} + +// httpChecklistProbeWithLocation runs one orchestration or CLI step. 
+// Signature: (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error). +// Why: redirects and auth gates require location-header assertions to prevent +// startup false-positives on partially healthy protected services. +func (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error) { + timeout := time.Duration(check.TimeoutSeconds) * time.Second + if timeout <= 0 { + timeout = 12 * time.Second + } + + transport := &http.Transport{} + if check.InsecureSkipTLS { + transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + } + client := &http.Client{ + Timeout: timeout, + Transport: transport, + CheckRedirect: func(_ *http.Request, _ []*http.Request) error { + return http.ErrUseLastResponse + }, + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(check.URL), nil) + if err != nil { + return 0, "", "", fmt.Errorf("build request: %w", err) + } + req.Header.Set("User-Agent", "ananke/startup-checklist") + + resp, err := client.Do(req) + if err != nil { + return 0, "", "", fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) + if readErr != nil { + return resp.StatusCode, "", "", fmt.Errorf("read response body: %w", readErr) + } + + return resp.StatusCode, string(body), strings.TrimSpace(resp.Header.Get("Location")), nil +} + +// checklistContains runs one orchestration or CLI step. +// Signature: checklistContains(body, marker string) bool. +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func checklistContains(body, marker string) bool {
	bodyLower := strings.ToLower(body)
	markerLower := strings.ToLower(marker)
	// Fast path: plain case-insensitive substring match.
	if strings.Contains(bodyLower, markerLower) {
		return true
	}
	// Slow path: retry with whitespace stripped from both sides so markup
	// reformatting (line breaks, indentation) does not defeat the match.
	markerCompact := compactLowerNoSpace(markerLower)
	if markerCompact == "" {
		// A marker made only of whitespace matches anything.
		return true
	}
	return strings.Contains(compactLowerNoSpace(bodyLower), markerCompact)
}

// compactLowerNoSpace returns s with every Unicode whitespace rune removed.
// Despite the name it does not change case; callers pass already-lowercased
// input.
// Signature: compactLowerNoSpace(s string) string.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func compactLowerNoSpace(s string) string {
	// strings.Map drops runes mapped to a negative value and returns s
	// unchanged (no allocation) when nothing is dropped.
	return strings.Map(func(r rune) rune {
		if unicode.IsSpace(r) {
			return -1
		}
		return r
	}, s)
}

// waitForStabilityWindow re-checks startup health once per poll interval until
// the configured stability window elapses, a check fails, or ctx is cancelled.
// Signature: (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
+func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error { + window := time.Duration(o.cfg.Startup.ServiceChecklistStabilitySec) * time.Second + if window <= 0 { + return nil + } + poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second + if poll <= 0 { + poll = 5 * time.Second + } + deadline := time.Now().Add(window) + lastStatus := time.Time{} + lastRecycleAttempt := time.Time{} + lastReplicaHeal := time.Time{} + + for { + o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) + o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal) + if err := o.startupStabilityHealthy(ctx); err != nil { + return fmt.Errorf("startup stability window failed: %w", err) + } + if time.Now().After(deadline) { + o.log.Printf("startup stability window passed (%s)", window) + return nil + } + if time.Since(lastStatus) >= 30*time.Second { + remaining := time.Until(deadline).Round(time.Second) + if remaining < 0 { + remaining = 0 + } + o.log.Printf("startup stability soak in progress (%s remaining)", remaining) + lastStatus = time.Now() + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(poll): + } + } +} + +// startupStabilityHealthy runs one orchestration or CLI step. +// Signature: (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error. +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
+func (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error { + if o.cfg.Startup.RequireFluxHealth { + ready, detail, err := o.fluxHealthReady(ctx) + if err != nil { + return fmt.Errorf("flux check error: %w", err) + } + if !ready { + return fmt.Errorf("flux not ready: %s", detail) + } + } + if o.cfg.Startup.RequireWorkloadConvergence { + ready, detail, err := o.workloadConvergenceReady(ctx) + if err != nil { + return fmt.Errorf("workload check error: %w", err) + } + if !ready { + return fmt.Errorf("workloads not converged: %s", detail) + } + } + if o.cfg.Startup.RequireServiceChecklist { + ready, detail := o.serviceChecklistReady(ctx) + if !ready { + return fmt.Errorf("external services not healthy: %s", detail) + } + } + if o.cfg.Startup.RequireIngressChecklist { + ready, detail := o.ingressChecklistReady(ctx) + if !ready { + return fmt.Errorf("ingress reachability not healthy: %s", detail) + } + } + failures, err := o.startupFailurePods(ctx) + if err != nil { + return fmt.Errorf("pod failure check error: %w", err) + } + if len(failures) > 0 { + return fmt.Errorf("pods in crash/image-pull failures: %s", joinLimited(failures, 8)) + } + return nil +} diff --git a/internal/cluster/orchestrator_test.go b/internal/cluster/orchestrator_test.go index c9254ca..30ef6c5 100644 --- a/internal/cluster/orchestrator_test.go +++ b/internal/cluster/orchestrator_test.go @@ -15,6 +15,9 @@ import ( "scm.bstein.dev/bstein/ananke/internal/state" ) +// TestParseVaultSealed runs one orchestration or CLI step. +// Signature: TestParseVaultSealed(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestParseVaultSealed(t *testing.T) { sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`) if err != nil { @@ -33,12 +36,18 @@ func TestParseVaultSealed(t *testing.T) { } } +// TestParseVaultSealedRejectsEmpty runs one orchestration or CLI step. 
+// Signature: TestParseVaultSealedRejectsEmpty(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestParseVaultSealedRejectsEmpty(t *testing.T) { if _, err := parseVaultSealed(" "); err == nil { t.Fatalf("expected parse error for empty status payload") } } +// TestParseVaultSealedWithKubectlPreamble runs one orchestration or CLI step. +// Signature: TestParseVaultSealedWithKubectlPreamble(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestParseVaultSealedWithKubectlPreamble(t *testing.T) { raw := "Defaulted container \"vault\" out of: vault, setup-config (init)\n{\"sealed\":true,\"initialized\":true}\n" sealed, err := parseVaultSealed(raw) @@ -50,6 +59,9 @@ func TestParseVaultSealedWithKubectlPreamble(t *testing.T) { } } +// TestFallbackWorkersFromInventoryUsesManagedNodes runs one orchestration or CLI step. +// Signature: TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ @@ -70,6 +82,9 @@ func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) { } } +// TestFallbackWorkersFromInventoryFallsBackToHosts runs one orchestration or CLI step. +// Signature: TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ @@ -89,12 +104,18 @@ func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) { } } +// TestIntentFreshTreatsZeroTimestampAsFresh runs one orchestration or CLI step. +// Signature: TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T). 
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T) { if !intentFresh(state.Intent{}, 30*time.Second) { t.Fatalf("zero updated_at intent should be treated as fresh") } } +// TestIntentFreshRespectsAge runs one orchestration or CLI step. +// Signature: TestIntentFreshRespectsAge(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestIntentFreshRespectsAge(t *testing.T) { stale := state.Intent{UpdatedAt: time.Now().Add(-2 * time.Minute)} fresh := state.Intent{UpdatedAt: time.Now().Add(-20 * time.Second)} @@ -106,6 +127,9 @@ func TestIntentFreshRespectsAge(t *testing.T) { } } +// TestCoordinationPeersDedupesAndIncludesForwardHost runs one orchestration or CLI step. +// Signature: TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ @@ -122,6 +146,9 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) { } } +// TestWorkloadTargetsIgnoredNodesByNodeSelector runs one orchestration or CLI step. +// Signature: TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) { spec := podSpec{ NodeSelector: map[string]string{ @@ -134,6 +161,9 @@ func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) { } } +// TestParseWorkloadIgnoreRules runs one orchestration or CLI step. +// Signature: TestParseWorkloadIgnoreRules(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func TestParseWorkloadIgnoreRules(t *testing.T) { rules := parseWorkloadIgnoreRules([]string{ "maintenance/metis", @@ -153,6 +183,9 @@ func TestParseWorkloadIgnoreRules(t *testing.T) { } } +// TestNamespaceCandidatesFromIgnoreKustomizations runs one orchestration or CLI step. +// Signature: TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) { got := namespaceCandidatesFromIgnoreKustomizations([]string{ "flux-system/jellyfin", @@ -166,12 +199,18 @@ func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) { } } +// TestProbeStatusAcceptedRejects404 runs one orchestration or CLI step. +// Signature: TestProbeStatusAcceptedRejects404(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestProbeStatusAcceptedRejects404(t *testing.T) { if probeStatusAccepted("https://metrics.bstein.dev/login", 404) { t.Fatalf("expected 404 probe status to be rejected") } } +// TestParseFluxKustomizationTimeout runs one orchestration or CLI step. +// Signature: TestParseFluxKustomizationTimeout(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestParseFluxKustomizationTimeout(t *testing.T) { if got := parseFluxKustomizationTimeout("30m"); got != 30*time.Minute { t.Fatalf("expected 30m duration, got %s", got) @@ -187,6 +226,9 @@ func TestParseFluxKustomizationTimeout(t *testing.T) { } } +// TestServiceCheckReadyRequiresBodyContains runs one orchestration or CLI step. +// Signature: TestServiceCheckReadyRequiresBodyContains(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func TestServiceCheckReadyRequiresBodyContains(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) @@ -209,6 +251,9 @@ func TestServiceCheckReadyRequiresBodyContains(t *testing.T) { } } +// TestServiceCheckReadyBodyContainsIgnoresWhitespace runs one orchestration or CLI step. +// Signature: TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) @@ -231,6 +276,62 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) { } } +// TestServiceCheckReadyRequiresLocationContains runs one orchestration or CLI step. +// Signature: TestServiceCheckReadyRequiresLocationContains(t *testing.T). +// Why: startup checks must validate redirect targets for OIDC-gated services. +func TestServiceCheckReadyRequiresLocationContains(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=logs") + w.WriteHeader(http.StatusFound) + })) + defer srv.Close() + + orch := &Orchestrator{ + log: log.New(os.Stdout, "", 0), + } + ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{ + Name: "logging-oidc-redirect", + URL: srv.URL, + AcceptedStatuses: []int{302}, + LocationContains: "client_id=logs", + TimeoutSeconds: 5, + }) + if !ok { + t.Fatalf("expected location-aware service check to pass, detail=%s", detail) + } +} + +// TestServiceCheckReadyRejectsMissingLocationMarker runs one orchestration or CLI step. +// Signature: TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T). 
+// Why: prevents false positives when redirects point somewhere unexpected. +func TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=wrong") + w.WriteHeader(http.StatusFound) + })) + defer srv.Close() + + orch := &Orchestrator{ + log: log.New(os.Stdout, "", 0), + } + ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{ + Name: "logging-oidc-redirect", + URL: srv.URL, + AcceptedStatuses: []int{302}, + LocationContains: "client_id=logs", + TimeoutSeconds: 5, + }) + if ok { + t.Fatalf("expected location-aware service check to fail") + } + if !strings.Contains(detail, "location header missing expected marker") { + t.Fatalf("expected missing location marker detail, got %q", detail) + } +} + +// TestChecklistFailureHostFromIngressDetail runs one orchestration or CLI step. +// Signature: TestChecklistFailureHostFromIngressDetail(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestChecklistFailureHostFromIngressDetail(t *testing.T) { orch := &Orchestrator{} got := orch.checklistFailureHost("cloud.bstein.dev: unexpected status code=500") @@ -239,6 +340,9 @@ func TestChecklistFailureHostFromIngressDetail(t *testing.T) { } } +// TestChecklistFailureHostFromServiceCheckName runs one orchestration or CLI step. +// Signature: TestChecklistFailureHostFromServiceCheckName(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestChecklistFailureHostFromServiceCheckName(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ @@ -258,6 +362,9 @@ func TestChecklistFailureHostFromServiceCheckName(t *testing.T) { } } +// TestChecklistFailureHostUnknown runs one orchestration or CLI step. 
+// Signature: TestChecklistFailureHostUnknown(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestChecklistFailureHostUnknown(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ @@ -279,6 +386,9 @@ func TestChecklistFailureHostUnknown(t *testing.T) { } } +// TestStuckVaultInitReasonDetectsHungInit runs one orchestration or CLI step. +// Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) { var pod podResource pod.Status.Phase = "Pending" @@ -302,6 +412,9 @@ func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) { } } +// TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods runs one orchestration or CLI step. +// Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) { var pod podResource pod.Status.Phase = "Pending" @@ -328,70 +441,3 @@ func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) { t.Fatalf("expected no reason for non-vault pod, got %q", reason) } } - -func TestValidateNodeInventoryPassesForStrictMappings(t *testing.T) { - orch := &Orchestrator{ - cfg: config.Config{ - SSHUser: "atlas", - SSHPort: 2277, - SSHNodeHosts: map[string]string{ - "titan-0a": "192.168.22.11", - "titan-0b": "192.168.22.12", - "titan-0c": "192.168.22.13", - "titan-22": "192.168.22.22", - }, - SSHManagedNodes: []string{"titan-0a", "titan-0b", "titan-0c", "titan-22"}, - ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"}, - Workers: []string{"titan-22"}, - }, - log: log.New(os.Stdout, "", 0), - } - if err := orch.validateNodeInventory(); err != nil { - t.Fatalf("expected inventory to pass, got error: %v", err) - } -} - -func TestValidateNodeInventoryFailsWhenNodeMappingMissing(t *testing.T) { - orch := &Orchestrator{ - cfg: config.Config{ - SSHUser: "atlas", - SSHPort: 2277, - SSHNodeHosts: map[string]string{"titan-0a": "192.168.22.11"}, - SSHManagedNodes: []string{"titan-0a", "titan-0b"}, - ControlPlanes: []string{"titan-0a"}, - Workers: []string{"titan-0b"}, - }, - log: log.New(os.Stdout, "", 0), - } - err := orch.validateNodeInventory() - if err == nil { - t.Fatalf("expected inventory error for missing mapping") - } - if !strings.Contains(err.Error(), "missing ssh_node_hosts entry") { - t.Fatalf("expected missing-mapping detail, got: %v", err) - } -} - -func TestValidateNodeInventoryFailsWhenWorkerNotManaged(t *testing.T) { - orch := &Orchestrator{ - cfg: config.Config{ - SSHUser: "atlas", - SSHPort: 2277, - SSHNodeHosts: map[string]string{ - "titan-0a": "192.168.22.11", - "titan-22": "192.168.22.22", - }, - SSHManagedNodes: []string{"titan-0a"}, - ControlPlanes: []string{"titan-0a"}, - Workers: []string{"titan-22"}, - }, - log: log.New(os.Stdout, 
"", 0), - } - err := orch.validateNodeInventory() - if err == nil { - t.Fatalf("expected inventory error for unmanaged worker") - } - if !strings.Contains(err.Error(), "missing from ssh_managed_nodes") { - t.Fatalf("expected unmanaged-worker detail, got: %v", err) - } -} diff --git a/internal/config/apply_defaults.go b/internal/config/apply_defaults.go new file mode 100644 index 0000000..0a6a75b --- /dev/null +++ b/internal/config/apply_defaults.go @@ -0,0 +1,236 @@ +package config + +import "strings" + +// applyDefaults fills empty, nil, or non-positive Config fields with built-in defaults and merges the baseline service checklist and critical endpoints into the configured ones. +// Signature: (c *Config) applyDefaults(). +// Why: partial host configs must still end up with usable timeouts, paths, and startup checks; boolean toggles are not modified here. +func (c *Config) applyDefaults() { + if c.ExpectedFluxBranch == "" { + c.ExpectedFluxBranch = "main" + } + if c.IACRepoPath == "" { + c.IACRepoPath = "/opt/titan-iac" + } + if c.ExpectedFluxSource == "" { + c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git" + } + if c.Startup.APIWaitSeconds <= 0 { + c.Startup.APIWaitSeconds = 1200 + } + if c.Startup.APIPollSeconds <= 0 { + c.Startup.APIPollSeconds = 2 + } + if c.Startup.ShutdownCooldownSeconds <= 0 { + c.Startup.ShutdownCooldownSeconds = 45 + } + if c.Startup.MinimumBatteryPercent <= 0 { + c.Startup.MinimumBatteryPercent = 20 + } + if c.Startup.NodeInventoryReachWaitSeconds <= 0 { + c.Startup.NodeInventoryReachWaitSeconds = 300 + } + if c.Startup.NodeInventoryReachPollSeconds <= 0 { + c.Startup.NodeInventoryReachPollSeconds = 5 + } + if c.Startup.RequiredNodeLabels == nil { + c.Startup.RequiredNodeLabels = map[string]map[string]string{ + "titan-09": { + "ananke.bstein.dev/harbor-bootstrap": "true", + }, + } + } + if c.Startup.TimeSyncWaitSeconds <= 0 { + c.Startup.TimeSyncWaitSeconds = 240 + } + if c.Startup.TimeSyncPollSeconds <= 0 { + c.Startup.TimeSyncPollSeconds = 5 + } + if c.Startup.TimeSyncMode == "" { + c.Startup.TimeSyncMode = "quorum" + } + if c.Startup.TimeSyncQuorum <= 0 { + 
c.Startup.TimeSyncQuorum = 2 + } + if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 { + c.Startup.TimeSyncQuorum = len(c.ControlPlanes) + } + if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 { + c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0] + } + if c.Startup.StorageReadyWaitSeconds <= 0 { + c.Startup.StorageReadyWaitSeconds = 420 + } + if c.Startup.StorageReadyPollSeconds <= 0 { + c.Startup.StorageReadyPollSeconds = 5 + } + if c.Startup.StorageMinReadyNodes <= 0 { + c.Startup.StorageMinReadyNodes = 2 + } + if len(c.Startup.StorageCriticalPVCs) == 0 { + c.Startup.StorageCriticalPVCs = []string{ + "vault/data-vault-0", + "postgres/postgres-data-postgres-0", + "gitea/gitea-data", + "sso/keycloak-data", + } + } + if c.Startup.PostStartProbeWaitSeconds <= 0 { + c.Startup.PostStartProbeWaitSeconds = 240 + } + if c.Startup.PostStartProbePollSeconds <= 0 { + c.Startup.PostStartProbePollSeconds = 5 + } + if len(c.Startup.PostStartProbes) == 0 { + c.Startup.PostStartProbes = []string{ + "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", + "https://scm.bstein.dev/api/healthz", + "https://metrics.bstein.dev/api/health", + } + } + if c.Startup.ServiceChecklistWaitSeconds <= 0 { + c.Startup.ServiceChecklistWaitSeconds = 420 + } + if c.Startup.ServiceChecklistPollSeconds <= 0 { + c.Startup.ServiceChecklistPollSeconds = 5 + } + if c.Startup.ServiceChecklistStabilitySec < 0 { + c.Startup.ServiceChecklistStabilitySec = 0 + } + c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist()) + for i := range c.Startup.ServiceChecklist { + if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 { + c.Startup.ServiceChecklist[i].TimeoutSeconds = 12 + } + } + if c.Startup.CriticalServiceEndpointWaitSec <= 0 { + c.Startup.CriticalServiceEndpointWaitSec = 420 + } + if c.Startup.CriticalServiceEndpointPollSec <= 0 { + c.Startup.CriticalServiceEndpointPollSec = 5 
+ } + c.Startup.CriticalServiceEndpoints = mergeStringDefaults(c.Startup.CriticalServiceEndpoints, defaultCriticalServiceEndpoints()) + if c.Startup.IngressChecklistWaitSeconds <= 0 { + c.Startup.IngressChecklistWaitSeconds = 420 + } + if c.Startup.IngressChecklistPollSeconds <= 0 { + c.Startup.IngressChecklistPollSeconds = 5 + } + if len(c.Startup.IngressChecklistAccepted) == 0 { + c.Startup.IngressChecklistAccepted = []int{200, 301, 302, 307, 308, 401, 403, 404} + } + if c.Startup.IngressChecklistIgnoreHosts == nil { + c.Startup.IngressChecklistIgnoreHosts = []string{} + } + if c.Startup.NodeSSHAuthWaitSeconds <= 0 { + c.Startup.NodeSSHAuthWaitSeconds = 240 + } + if c.Startup.NodeSSHAuthPollSeconds <= 0 { + c.Startup.NodeSSHAuthPollSeconds = 5 + } + if c.Startup.FluxHealthWaitSeconds <= 0 { + c.Startup.FluxHealthWaitSeconds = 900 + } + if c.Startup.FluxHealthPollSeconds <= 0 { + c.Startup.FluxHealthPollSeconds = 5 + } + if c.Startup.IgnoreFluxKustomizations == nil { + c.Startup.IgnoreFluxKustomizations = []string{} + } + if c.Startup.WorkloadConvergenceWaitSeconds <= 0 { + c.Startup.WorkloadConvergenceWaitSeconds = 900 + } + if c.Startup.WorkloadConvergencePollSeconds <= 0 { + c.Startup.WorkloadConvergencePollSeconds = 5 + } + if c.Startup.IgnoreWorkloadNamespaces == nil { + c.Startup.IgnoreWorkloadNamespaces = []string{} + } + if c.Startup.IgnoreWorkloads == nil { + c.Startup.IgnoreWorkloads = []string{} + } + if c.Startup.IgnoreUnavailableNodes == nil { + c.Startup.IgnoreUnavailableNodes = []string{} + } + if c.Startup.StuckPodGraceSeconds <= 0 { + c.Startup.StuckPodGraceSeconds = 180 + } + if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" { + c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key" + } + if c.Startup.VaultUnsealBreakglassTimeout <= 0 { + c.Startup.VaultUnsealBreakglassTimeout = 15 + } + if c.SSHPort <= 0 { + c.SSHPort = 2277 + } + if c.Shutdown.DefaultBudgetSeconds <= 0 { + c.Shutdown.DefaultBudgetSeconds = 1380 + } + if 
c.Shutdown.HistoryMinSamples <= 0 { + c.Shutdown.HistoryMinSamples = 3 + } + if c.Shutdown.EmergencyBudgetSec <= 0 { + c.Shutdown.EmergencyBudgetSec = 420 + } + if c.Shutdown.EmergencyMinSamples <= 0 { + c.Shutdown.EmergencyMinSamples = 3 + } + if c.Shutdown.DrainParallelism <= 0 { + c.Shutdown.DrainParallelism = 6 + } + if c.Shutdown.ScaleParallelism <= 0 { + c.Shutdown.ScaleParallelism = 8 + } + if c.Shutdown.SSHParallelism <= 0 { + c.Shutdown.SSHParallelism = 8 + } + if c.UPS.PollSeconds <= 0 { + c.UPS.PollSeconds = 5 + } + if c.UPS.RuntimeSafetyFactor <= 0 { + c.UPS.RuntimeSafetyFactor = 1.25 + } + if c.UPS.DebounceCount <= 0 { + c.UPS.DebounceCount = 3 + } + if c.UPS.TelemetryTimeoutSeconds <= 0 { + c.UPS.TelemetryTimeoutSeconds = 90 + } + if c.Coordination.ForwardShutdownConfig == "" { + c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml" + } + if c.Coordination.PeerHosts == nil { + c.Coordination.PeerHosts = []string{} + } + if c.Coordination.CommandTimeoutSeconds <= 0 { + c.Coordination.CommandTimeoutSeconds = 25 + } + if c.Coordination.StartupGuardMaxAgeSec <= 0 { + c.Coordination.StartupGuardMaxAgeSec = 900 + } + if c.Coordination.Role == "" { + c.Coordination.Role = "coordinator" + } + if c.Metrics.BindAddr == "" { + c.Metrics.BindAddr = "0.0.0.0:9560" + } + if c.Metrics.Path == "" { + c.Metrics.Path = "/metrics" + } + if c.State.Dir == "" { + c.State.Dir = "/var/lib/ananke" + } + if c.State.ReportsDir == "" { + c.State.ReportsDir = "/var/lib/ananke/reports" + } + if c.State.RunHistoryPath == "" { + c.State.RunHistoryPath = "/var/lib/ananke/runs.json" + } + if c.State.LockPath == "" { + c.State.LockPath = "/var/lib/ananke/ananke.lock" + } + if c.State.IntentPath == "" { + c.State.IntentPath = "/var/lib/ananke/intent.json" + } +} diff --git a/internal/config/config_test.go b/internal/config/config_test.go index d4af8fa..07f6f61 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -7,6 +7,9 @@ import ( "testing" 
) +// TestLoadAcceptsUPSTargets runs one orchestration or CLI step. +// Signature: TestLoadAcceptsUPSTargets(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestLoadAcceptsUPSTargets(t *testing.T) { tmp := t.TempDir() cfgPath := filepath.Join(tmp, "ananke.yaml") @@ -39,6 +42,9 @@ state: } } +// TestValidateForwardShutdownRequiresConfigPath runs one orchestration or CLI step. +// Signature: TestValidateForwardShutdownRequiresConfigPath(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) { cfg := defaults() cfg.Coordination.ForwardShutdownHost = "titan-db" @@ -48,6 +54,9 @@ func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) { } } +// TestValidateRejectsUnknownRole runs one orchestration or CLI step. +// Signature: TestValidateRejectsUnknownRole(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsUnknownRole(t *testing.T) { cfg := defaults() cfg.Coordination.Role = "unknown" @@ -56,6 +65,9 @@ func TestValidateRejectsUnknownRole(t *testing.T) { } } +// TestValidateRejectsEmptyPeerHostEntry runs one orchestration or CLI step. +// Signature: TestValidateRejectsEmptyPeerHostEntry(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) { cfg := defaults() cfg.Coordination.PeerHosts = []string{"titan-24", " "} @@ -64,6 +76,9 @@ func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) { } } +// TestValidateRejectsUnknownEtcdRestoreControlPlane runs one orchestration or CLI step. +// Signature: TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T). 
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) { cfg := defaults() cfg.Startup.EtcdRestoreControlPlane = "titan-missing" @@ -72,6 +87,9 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) { } } +// TestLoadSetsCoordinationGuardDefaults runs one orchestration or CLI step. +// Signature: TestLoadSetsCoordinationGuardDefaults(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestLoadSetsCoordinationGuardDefaults(t *testing.T) { tmp := t.TempDir() cfgPath := filepath.Join(tmp, "ananke.yaml") @@ -114,6 +132,9 @@ state: } } +// TestValidateRejectsInvalidStartupShutdownCooldown runs one orchestration or CLI step. +// Signature: TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) { cfg := defaults() cfg.Startup.ShutdownCooldownSeconds = 0 @@ -122,6 +143,9 @@ func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) { } } +// TestValidateRejectsInvalidTimeSyncMode runs one orchestration or CLI step. +// Signature: TestValidateRejectsInvalidTimeSyncMode(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) { cfg := defaults() cfg.Startup.TimeSyncMode = "invalid" @@ -130,6 +154,9 @@ func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) { } } +// TestValidateRejectsBadStoragePVCFormat runs one orchestration or CLI step. +// Signature: TestValidateRejectsBadStoragePVCFormat(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
func TestValidateRejectsBadStoragePVCFormat(t *testing.T) { cfg := defaults() cfg.Startup.StorageCriticalPVCs = []string{"vault-data-vault-0"} @@ -138,6 +165,9 @@ func TestValidateRejectsBadStoragePVCFormat(t *testing.T) { } } +// TestValidateRejectsMissingPostStartProbesWhenRequired runs one orchestration or CLI step. +// Signature: TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) { cfg := defaults() cfg.Startup.RequirePostStartProbes = true @@ -147,6 +177,9 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) { } } +// TestValidateRejectsMissingServiceChecklistWhenRequired runs one orchestration or CLI step. +// Signature: TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) { cfg := defaults() cfg.Startup.RequireServiceChecklist = true @@ -156,6 +189,9 @@ func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) { } } +// TestValidateRejectsBadServiceChecklistURL runs one orchestration or CLI step. +// Signature: TestValidateRejectsBadServiceChecklistURL(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsBadServiceChecklistURL(t *testing.T) { cfg := defaults() cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{ @@ -171,6 +207,9 @@ func TestValidateRejectsBadServiceChecklistURL(t *testing.T) { } } +// TestValidateRejectsBadIgnoreFluxKustomizationFormat runs one orchestration or CLI step. +// Signature: TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T). 
+// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) { cfg := defaults() cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"} @@ -179,6 +218,9 @@ func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) { } } +// TestValidateRejectsBadIgnoreWorkloadFormat runs one orchestration or CLI step. +// Signature: TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) { cfg := defaults() cfg.Startup.IgnoreWorkloads = []string{"maintenance/metis/extra/value"} @@ -187,6 +229,9 @@ func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) { } } +// TestValidateRejectsInvalidRequiredNodeLabel runs one orchestration or CLI step. +// Signature: TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) { cfg := defaults() cfg.Startup.RequiredNodeLabels = map[string]map[string]string{ @@ -198,3 +243,85 @@ func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) { t.Fatalf("expected validation error for invalid required_node_labels entry") } } + +// TestValidateRejectsInvalidNodeInventoryReachWindow runs one orchestration or CLI step. +// Signature: TestValidateRejectsInvalidNodeInventoryReachWindow(t *testing.T). +// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. 
+func TestValidateRejectsInvalidNodeInventoryReachWindow(t *testing.T) { + cfg := defaults() + cfg.Startup.NodeInventoryReachWaitSeconds = 0 + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for invalid node_inventory_reachability_wait_seconds") + } +} + +// TestValidateRejectsMissingReportsDir asserts that Validate returns an error when State.ReportsDir is empty. +// Signature: TestValidateRejectsMissingReportsDir(t *testing.T). +// Why: state.reports_dir is treated as a required setting, so an empty value must not pass validation. +func TestValidateRejectsMissingReportsDir(t *testing.T) { + cfg := defaults() + cfg.State.ReportsDir = "" + if err := cfg.Validate(); err == nil { + t.Fatalf("expected validation error for missing state.reports_dir") + } +} + +// TestApplyDefaultsMergesServiceChecklistDefaults asserts that applyDefaults keeps a custom checklist entry and adds the default checks next to it. +// Signature: TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T). +// Why: host configs may define a partial checklist; startup still needs the +// baseline service validations learned from drills. +func TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T) { + cfg := Config{ + Startup: Startup{ + ServiceChecklist: []ServiceChecklistCheck{ + { + Name: "custom-smoke", + URL: "https://example.invalid/healthz", + TimeoutSeconds: 7, + }, + }, + }, + } + cfg.applyDefaults() + + names := map[string]struct{}{} + for _, check := range cfg.Startup.ServiceChecklist { + names[check.Name] = struct{}{} + } + if _, ok := names["custom-smoke"]; !ok { + t.Fatalf("expected custom checklist entry to be preserved") + } + if _, ok := names["logging-oidc-redirect"]; !ok { + t.Fatalf("expected default logging redirect check to be merged in") + } + if _, ok := names["vaultwarden-ui"]; !ok { + t.Fatalf("expected default vaultwarden check to be merged in") + } +} + +// TestApplyDefaultsMergesCriticalServiceEndpointDefaults asserts that applyDefaults keeps a custom critical endpoint and adds the default entries. 
+// Signature: TestApplyDefaultsMergesCriticalServiceEndpointDefaults(t *testing.T). +// Why: startup endpoint gating must keep baseline backend checks even when host +// configs only provide a subset. +func TestApplyDefaultsMergesCriticalServiceEndpointDefaults(t *testing.T) { + cfg := Config{ + Startup: Startup{ + CriticalServiceEndpoints: []string{"customns/customsvc"}, + }, + } + cfg.applyDefaults() + + seen := map[string]struct{}{} + for _, entry := range cfg.Startup.CriticalServiceEndpoints { + seen[entry] = struct{}{} + } + if _, ok := seen["customns/customsvc"]; !ok { + t.Fatalf("expected custom critical endpoint to be preserved") + } + if _, ok := seen["logging/opensearch-dashboards"]; !ok { + t.Fatalf("expected logging/opensearch-dashboards critical endpoint default") + } + if _, ok := seen["monitoring/victoria-metrics-single-server"]; !ok { + t.Fatalf("expected monitoring/victoria-metrics-single-server critical endpoint default") + } +} diff --git a/internal/config/defaults.go b/internal/config/defaults.go new file mode 100644 index 0000000..a848a40 --- /dev/null +++ b/internal/config/defaults.go @@ -0,0 +1,155 @@ +package config + +// defaults returns the built-in baseline Config literal after passing it through applyDefaults. +// Signature: defaults() Config. +// Why: provides one fully populated starting point, with the Require* startup checks switched on, that the tests in this package mutate. 
+func defaults() Config { + c := Config{ + IACRepoPath: "/opt/titan-iac", + ExpectedFluxBranch: "main", + ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git", + SSHPort: 2277, + ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"}, + LocalBootstrapPaths: []string{ + "infrastructure/core", + "clusters/atlas/flux-system", + "infrastructure/sources/helm", + "infrastructure/metallb", + "infrastructure/traefik", + "infrastructure/cert-manager", + "infrastructure/vault-csi", + "infrastructure/vault-injector", + "services/vault", + "infrastructure/postgres", + "services/gitea", + "services/keycloak", + "services/oauth2-proxy", + }, + ExcludedNamespaces: []string{ + "kube-system", + "kube-public", + "kube-node-lease", + "flux-system", + "traefik", + "metallb-system", + "cert-manager", + "longhorn-system", + "vault", + "postgres", + "maintenance", + }, + Startup: Startup{ + APIWaitSeconds: 1200, + APIPollSeconds: 2, + ShutdownCooldownSeconds: 45, + RequireNodeInventoryReach: true, + NodeInventoryReachWaitSeconds: 300, + NodeInventoryReachPollSeconds: 5, + RequireTimeSync: true, + TimeSyncWaitSeconds: 240, + TimeSyncPollSeconds: 5, + TimeSyncMode: "quorum", + TimeSyncQuorum: 2, + ReconcileAccessOnBoot: true, + AutoEtcdRestoreOnAPIFailure: true, + EtcdRestoreControlPlane: "titan-0a", + RequireStorageReady: true, + StorageReadyWaitSeconds: 420, + StorageReadyPollSeconds: 5, + StorageMinReadyNodes: 2, + StorageCriticalPVCs: []string{ + "vault/data-vault-0", + "postgres/postgres-data-postgres-0", + "gitea/gitea-data", + "sso/keycloak-data", + }, + MinimumBatteryPercent: 20, + RequiredNodeLabels: map[string]map[string]string{ + "titan-09": { + "ananke.bstein.dev/harbor-bootstrap": "true", + }, + }, + RequirePostStartProbes: true, + PostStartProbeWaitSeconds: 240, + PostStartProbePollSeconds: 5, + PostStartProbes: []string{ + "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", + "https://scm.bstein.dev/api/healthz", + 
"https://metrics.bstein.dev/api/health", + }, + RequireServiceChecklist: true, + ServiceChecklistWaitSeconds: 420, + ServiceChecklistPollSeconds: 5, + ServiceChecklistStabilitySec: 120, + ServiceChecklist: defaultServiceChecklist(), + RequireCriticalServiceEndpoints: true, + CriticalServiceEndpointWaitSec: 420, + CriticalServiceEndpointPollSec: 5, + CriticalServiceEndpoints: defaultCriticalServiceEndpoints(), + RequireIngressChecklist: true, + IngressChecklistWaitSeconds: 420, + IngressChecklistPollSeconds: 5, + IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404}, + IngressChecklistIgnoreHosts: []string{}, + RequireNodeSSHAuth: true, + NodeSSHAuthWaitSeconds: 240, + NodeSSHAuthPollSeconds: 5, + RequireFluxHealth: true, + FluxHealthWaitSeconds: 900, + FluxHealthPollSeconds: 5, + IgnoreFluxKustomizations: []string{}, + RequireWorkloadConvergence: true, + WorkloadConvergenceWaitSeconds: 900, + WorkloadConvergencePollSeconds: 5, + IgnoreWorkloadNamespaces: []string{}, + IgnoreWorkloads: []string{}, + IgnoreUnavailableNodes: []string{}, + AutoRecycleStuckPods: true, + StuckPodGraceSeconds: 180, + VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key", + VaultUnsealBreakglassTimeout: 15, + }, + Shutdown: Shutdown{ + DefaultBudgetSeconds: 1380, + HistoryMinSamples: 3, + EmergencyBudgetSec: 420, + EmergencyMinSamples: 3, + EmergencySkipEtcd: true, + EmergencySkipDrain: true, + DrainParallelism: 6, + ScaleParallelism: 8, + SSHParallelism: 8, + }, + UPS: UPS{ + Enabled: true, + Provider: "nut", + PollSeconds: 5, + RuntimeSafetyFactor: 1.25, + DebounceCount: 3, + TelemetryTimeoutSeconds: 90, + }, + Coordination: Coordination{ + ForwardShutdownConfig: "/etc/ananke/ananke.yaml", + PeerHosts: []string{}, + FallbackLocalShutdown: true, + CommandTimeoutSeconds: 25, + StartupGuardMaxAgeSec: 900, + Role: "coordinator", + AllowStartupOnBattery: false, + }, + Metrics: Metrics{ + Enabled: true, + BindAddr: "0.0.0.0:9560", + Path: "/metrics", + }, + State: State{ + 
Dir: "/var/lib/ananke", + ReportsDir: "/var/lib/ananke/reports", + RunHistoryPath: "/var/lib/ananke/runs.json", + LockPath: "/var/lib/ananke/ananke.lock", + IntentPath: "/var/lib/ananke/intent.json", + }, + } + c.applyDefaults() + return c +} diff --git a/internal/config/startup_service_catalog.go b/internal/config/startup_service_catalog.go new file mode 100644 index 0000000..922ede8 --- /dev/null +++ b/internal/config/startup_service_catalog.go @@ -0,0 +1,315 @@ +package config + +import "strings" + +// defaultServiceChecklist returns the baseline per-service HTTP checks: URL, accepted statuses, an optional body or Location marker, and a timeout. +// Signature: defaultServiceChecklist() []ServiceChecklistCheck. +// Why: startup must verify real external behavior per service (not only generic +// ingress reachability) so false positives do not pass drills. +func defaultServiceChecklist() []ServiceChecklistCheck { + return []ServiceChecklistCheck{ + { + Name: "gitea-api", + URL: "https://scm.bstein.dev/api/healthz", + AcceptedStatuses: []int{200}, + BodyContains: "pass", + TimeoutSeconds: 12, + }, + { + Name: "grafana-api", + URL: "https://metrics.bstein.dev/api/health", + AcceptedStatuses: []int{200}, + BodyContains: "\"database\":\"ok\"", + TimeoutSeconds: 12, + }, + { + Name: "keycloak-oidc", + URL: "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", + AcceptedStatuses: []int{200}, + BodyContains: "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"", + TimeoutSeconds: 12, + }, + { + Name: "harbor-registry-api", + URL: "https://registry.bstein.dev/v2/", + AcceptedStatuses: []int{401}, + BodyContains: "unauthorized", + TimeoutSeconds: 12, + }, + { + Name: "alerts-ui", + URL: "https://alerts.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "Alertmanager", + TimeoutSeconds: 12, + }, + { + Name: "auth-gateway-redirect", + URL: "https://auth.bstein.dev/", + AcceptedStatuses: []int{302}, + LocationContains: "https://sso.bstein.dev/realms/atlas/", + TimeoutSeconds: 12, + }, + { + Name: "home-site", + URL: 
"https://bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "Titan Lab", + TimeoutSeconds: 12, + }, + { + Name: "actual-budget-ui", + URL: "https://budget.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "Actual", + TimeoutSeconds: 12, + }, + { + Name: "element-call-ui", + URL: "https://call.live.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "Element Call", + TimeoutSeconds: 12, + }, + { + Name: "flux-gitops-ui", + URL: "https://cd.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "Weave GitOps", + TimeoutSeconds: 12, + }, + { + Name: "chat-ai-health", + URL: "https://chat.ai.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "\"ok\": true", + TimeoutSeconds: 12, + }, + { + Name: "jenkins-auth-gate", + URL: "https://ci.bstein.dev/", + AcceptedStatuses: []int{403}, + BodyContains: "commenceLogin", + TimeoutSeconds: 12, + }, + { + Name: "nextcloud-login-redirect", + URL: "https://cloud.bstein.dev/", + AcceptedStatuses: []int{302}, + LocationContains: "/index.php/login", + TimeoutSeconds: 12, + }, + { + Name: "wger-redirect", + URL: "https://health.bstein.dev/", + AcceptedStatuses: []int{302}, + LocationContains: "/en/", + TimeoutSeconds: 12, + }, + { + Name: "livekit-edge", + URL: "https://kit.live.bstein.dev/", + AcceptedStatuses: []int{404}, + BodyContains: "404 page not found", + TimeoutSeconds: 12, + }, + { + Name: "element-web-ui", + URL: "https://live.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "<title>Element", + TimeoutSeconds: 12, + }, + { + Name: "logging-oidc-redirect", + URL: "https://logs.bstein.dev/", + AcceptedStatuses: []int{302}, + LocationContains: "client_id=logs", + TimeoutSeconds: 12, + }, + { + Name: "longhorn-oidc-redirect", + URL: "https://longhorn.bstein.dev/", + AcceptedStatuses: []int{302}, + LocationContains: "https://sso.bstein.dev/realms/atlas/", + TimeoutSeconds: 12, + }, + { + Name: "matrix-auth-ui", + URL: "https://matrix.live.bstein.dev/", + 
AcceptedStatuses: []int{200}, + BodyContains: "matrix-authentication-service", + TimeoutSeconds: 12, + }, + { + Name: "monero-edge", + URL: "https://monero.bstein.dev/", + AcceptedStatuses: []int{404}, + TimeoutSeconds: 12, + }, + { + Name: "firefly-login-redirect", + URL: "https://money.bstein.dev/", + AcceptedStatuses: []int{302}, + LocationContains: "/login", + TimeoutSeconds: 12, + }, + { + Name: "outline-ui", + URL: "https://notes.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "Outline", + TimeoutSeconds: 12, + }, + { + Name: "collabora-probe", + URL: "https://office.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "OK", + TimeoutSeconds: 12, + }, + { + Name: "pegasus-ui", + URL: "https://pegasus.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "Pegasus", + TimeoutSeconds: 12, + }, + { + Name: "harbor-ui", + URL: "https://registry.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "Harbor", + TimeoutSeconds: 12, + }, + { + Name: "vault-ui-redirect", + URL: "https://secret.bstein.dev/", + AcceptedStatuses: []int{307}, + LocationContains: "/ui/", + TimeoutSeconds: 12, + }, + { + Name: "sentinel-oidc-redirect", + URL: "https://sentinel.bstein.dev/", + AcceptedStatuses: []int{302}, + LocationContains: "client_id=metis", + TimeoutSeconds: 12, + }, + { + Name: "keycloak-admin-redirect", + URL: "https://sso.bstein.dev/", + AcceptedStatuses: []int{302}, + LocationContains: "https://sso.bstein.dev/admin/", + TimeoutSeconds: 12, + }, + { + Name: "jellyfin-edge", + URL: "https://stream.bstein.dev/", + AcceptedStatuses: []int{302}, + LocationContains: "web/", + TimeoutSeconds: 12, + }, + { + Name: "planka-ui", + URL: "https://tasks.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "PLANKA", + TimeoutSeconds: 12, + }, + { + Name: "vaultwarden-ui", + URL: "https://vault.bstein.dev/", + AcceptedStatuses: []int{200}, + BodyContains: "Vaultwarden Web", + TimeoutSeconds: 12, + }, + } +} + +// 
defaultCriticalServiceEndpoints returns the baseline list of monitoring/... and logging/... service entries. +// Signature: defaultCriticalServiceEndpoints() []string. +// Why: service edge checks are insufficient for protected stacks; endpoint +// presence verifies that backends are actually routable before startup success. +func defaultCriticalServiceEndpoints() []string { + return []string{ + "monitoring/victoria-metrics-single-server", + "monitoring/grafana", + "monitoring/kube-state-metrics", + "logging/oauth2-proxy-logs", + "logging/opensearch-dashboards", + "logging/opensearch-master", + } +} + +// mergeServiceChecklistDefaults returns existing followed by each named default whose trimmed Name is absent from existing; an empty existing yields a copy of defaults. +// Signature: mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck. +// Why: host configs can keep custom checks while still inheriting mandatory +// baseline checks introduced after incident learnings. +func mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck { + if len(existing) == 0 { + out := make([]ServiceChecklistCheck, 0, len(defaults)) + out = append(out, defaults...) + return out + } + + byName := map[string]struct{}{} + for _, check := range existing { + name := strings.TrimSpace(check.Name) + if name == "" { + continue + } + byName[name] = struct{}{} + } + + out := make([]ServiceChecklistCheck, 0, len(existing)+len(defaults)) + out = append(out, existing...) + for _, check := range defaults { + name := strings.TrimSpace(check.Name) + if name == "" { + continue + } + if _, exists := byName[name]; exists { + continue + } + out = append(out, check) + } + return out +} + +// mergeStringDefaults returns the trimmed, non-blank, de-duplicated items of existing followed by those of defaults; an empty existing yields a copy of defaults. +// Signature: mergeStringDefaults(existing, defaults []string) []string. +// Why: keeps baseline startup guards applied while preserving site-specific +// additions already declared in host configs. 
+func mergeStringDefaults(existing, defaults []string) []string { + if len(existing) == 0 { + out := make([]string, 0, len(defaults)) + out = append(out, defaults...) + return out + } + seen := map[string]struct{}{} + out := make([]string, 0, len(existing)+len(defaults)) + for _, item := range existing { + key := strings.TrimSpace(item) + if key == "" { + continue + } + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + out = append(out, key) + } + for _, item := range defaults { + key := strings.TrimSpace(item) + if key == "" { + continue + } + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + out = append(out, key) + } + return out +} diff --git a/internal/config/types.go b/internal/config/types.go new file mode 100644 index 0000000..a253c8f --- /dev/null +++ b/internal/config/types.go @@ -0,0 +1,156 @@ +package config + +type Config struct { + Kubeconfig string `yaml:"kubeconfig"` + SSHUser string `yaml:"ssh_user"` + SSHPort int `yaml:"ssh_port"` + SSHConfigFile string `yaml:"ssh_config_file"` + SSHIdentityFile string `yaml:"ssh_identity_file"` + SSHNodeHosts map[string]string `yaml:"ssh_node_hosts"` + SSHNodeUsers map[string]string `yaml:"ssh_node_users"` + SSHManagedNodes []string `yaml:"ssh_managed_nodes"` + SSHJumpHost string `yaml:"ssh_jump_host"` + SSHJumpUser string `yaml:"ssh_jump_user"` + IACRepoPath string `yaml:"iac_repo_path"` + ExpectedFluxBranch string `yaml:"expected_flux_branch"` + ExpectedFluxSource string `yaml:"expected_flux_source_url"` + ControlPlanes []string `yaml:"control_planes"` + Workers []string `yaml:"workers"` + LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"` + ExcludedNamespaces []string `yaml:"excluded_namespaces"` + Startup Startup `yaml:"startup"` + Shutdown Shutdown `yaml:"shutdown"` + UPS UPS `yaml:"ups"` + Coordination Coordination `yaml:"coordination"` + Metrics Metrics `yaml:"metrics"` + State State `yaml:"state"` +} + +type Startup struct { + APIWaitSeconds int 
`yaml:"api_wait_seconds"` + APIPollSeconds int `yaml:"api_poll_seconds"` + ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"` + MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"` + RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"` + NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"` + NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"` + RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"` + RequireTimeSync bool `yaml:"require_time_sync"` + TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` + TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` + TimeSyncMode string `yaml:"time_sync_mode"` + TimeSyncQuorum int `yaml:"time_sync_quorum"` + ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` + AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` + EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` + RequireStorageReady bool `yaml:"require_storage_ready"` + StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` + StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` + StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` + StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` + RequirePostStartProbes bool `yaml:"require_post_start_probes"` + PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` + PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` + PostStartProbes []string `yaml:"post_start_probes"` + RequireServiceChecklist bool `yaml:"require_service_checklist"` + ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"` + ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"` + ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"` + ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"` + RequireCriticalServiceEndpoints bool 
`yaml:"require_critical_service_endpoints"` + CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"` + CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"` + CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"` + RequireIngressChecklist bool `yaml:"require_ingress_checklist"` + IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"` + IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"` + IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"` + IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"` + IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"` + RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"` + NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"` + NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"` + RequireFluxHealth bool `yaml:"require_flux_health"` + FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"` + FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"` + IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"` + RequireWorkloadConvergence bool `yaml:"require_workload_convergence"` + WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"` + WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"` + IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"` + IgnoreWorkloads []string `yaml:"ignore_workloads"` + IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"` + AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"` + StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"` + VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` + VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` + VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` +} + +type ServiceChecklistCheck struct { + Name string 
`yaml:"name"` + URL string `yaml:"url"` + AcceptedStatuses []int `yaml:"accepted_statuses"` + LocationContains string `yaml:"location_contains"` + LocationNotContains string `yaml:"location_not_contains"` + BodyContains string `yaml:"body_contains"` + BodyNotContains string `yaml:"body_not_contains"` + TimeoutSeconds int `yaml:"timeout_seconds"` + InsecureSkipTLS bool `yaml:"insecure_skip_tls"` +} + +type Shutdown struct { + DefaultBudgetSeconds int `yaml:"default_budget_seconds"` + HistoryMinSamples int `yaml:"history_min_samples"` + EmergencyBudgetSec int `yaml:"emergency_budget_seconds"` + EmergencyMinSamples int `yaml:"emergency_history_min_samples"` + EmergencySkipEtcd bool `yaml:"emergency_skip_etcd_snapshot"` + EmergencySkipDrain bool `yaml:"emergency_skip_drain"` + SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"` + SkipDrain bool `yaml:"skip_drain"` + DrainParallelism int `yaml:"drain_parallelism"` + ScaleParallelism int `yaml:"scale_parallelism"` + SSHParallelism int `yaml:"ssh_parallelism"` +} + +type UPS struct { + Enabled bool `yaml:"enabled"` + Provider string `yaml:"provider"` + Target string `yaml:"target"` + Targets []UPSTarget `yaml:"targets"` + PollSeconds int `yaml:"poll_seconds"` + RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"` + DebounceCount int `yaml:"debounce_count"` + TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"` +} + +type UPSTarget struct { + Name string `yaml:"name"` + Target string `yaml:"target"` +} + +type Coordination struct { + ForwardShutdownHost string `yaml:"forward_shutdown_host"` + ForwardShutdownUser string `yaml:"forward_shutdown_user"` + ForwardShutdownConfig string `yaml:"forward_shutdown_config"` + PeerHosts []string `yaml:"peer_hosts"` + FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"` + CommandTimeoutSeconds int `yaml:"command_timeout_seconds"` + StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"` + Role string `yaml:"role"` + AllowStartupOnBattery bool 
`yaml:"allow_startup_on_battery"` +} + +type Metrics struct { + Enabled bool `yaml:"enabled"` + BindAddr string `yaml:"bind_addr"` + Path string `yaml:"path"` +} + +type State struct { + Dir string `yaml:"dir"` + ReportsDir string `yaml:"reports_dir"` + RunHistoryPath string `yaml:"run_history_path"` + LockPath string `yaml:"lock_path"` + IntentPath string `yaml:"intent_path"` +}