startup: enforce external service behavior checks

This commit is contained in:
Brad Stein 2026-04-08 23:42:09 -03:00
parent 296ca85c78
commit 95fefba244
8 changed files with 1615 additions and 67 deletions

View File

@ -0,0 +1,124 @@
package cluster
import (
"context"
"errors"
"log"
"regexp"
"sync"
"time"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// Orchestrator holds the configuration, command runner, state store and logger
// supplied to New, plus optional command overrides and a pointer to the startup
// report currently in progress.
type Orchestrator struct {
cfg config.Config
runner *execx.Runner
store *state.Store
log *log.Logger
// runOverride and runSensitiveOverride are installed via SetCommandOverrides.
// NOTE(review): where they are consulted is not visible in this chunk.
runOverride func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
runSensitiveOverride func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
// NOTE(review): startupReportMu presumably guards activeStartupReport — confirm
// against the report helpers, which are outside this chunk.
startupReportMu sync.Mutex
activeStartupReport *startupReport
}
// commandOverrideFunc is the signature accepted by SetCommandOverrides: it
// receives a context, a timeout, a command name and its arguments, and returns
// the command output as a string together with an error.
type commandOverrideFunc func(timeoutCtx context.Context, timeout time.Duration, name string, args ...string) (string, error)
// StartupOptions groups caller-supplied options for a startup run.
// NOTE(review): field meanings are inferred from their names; the code that
// reads them is not visible in this chunk.
type StartupOptions struct {
ForceFluxBranch string
SkipLocalBootstrap bool
Reason string
}
// ShutdownOptions groups caller-supplied options for a shutdown run.
// NOTE(review): field meanings (including the accepted Mode values) are
// inferred from their names; the consumers are not visible in this chunk.
type ShutdownOptions struct {
SkipEtcdSnapshot bool
SkipDrain bool
Mode string
Reason string
}
// EtcdRestoreOptions groups the inputs for an etcd restore request.
// NOTE(review): presumably ControlPlane names the target node and SnapshotPath
// the snapshot file — confirm against the restore code outside this chunk.
type EtcdRestoreOptions struct {
ControlPlane string
SnapshotPath string
}
// startupWorkload identifies a single workload by namespace, kind and name.
// In this file the kinds used are "deployment" and "statefulset".
type startupWorkload struct {
Namespace string
Kind string
Name string
}
// workloadScaleEntry is the JSON-serializable record of one workload
// (namespace, kind, name) together with a replica count.
type workloadScaleEntry struct {
Namespace string `json:"namespace"`
Kind string `json:"kind"`
Name string `json:"name"`
Replicas int `json:"replicas"`
}
// remotePeerStatus pairs a state.Intent with a flag saying whether a bootstrap
// is active.
// NOTE(review): presumably describes another coordinating host — confirm
// against the peer-coordination code outside this chunk.
type remotePeerStatus struct {
Intent state.Intent
BootstrapActive bool
}
// workloadScaleSnapshot is a timestamped, JSON-serializable list of
// workloadScaleEntry records.
type workloadScaleSnapshot struct {
GeneratedAt time.Time `json:"generated_at"`
Entries []workloadScaleEntry `json:"entries"`
}
// startupReport is the JSON-serializable record of a startup attempt: its
// timestamps, reason, status/phase, outcome, per-check records and the list of
// auto-heal actions.
type startupReport struct {
StartedAt time.Time `json:"started_at"`
// Completed is serialized under the key "completed_at".
Completed time.Time `json:"completed_at"`
Reason string `json:"reason"`
Status string `json:"status"`
Phase string `json:"phase"`
Success bool `json:"success"`
// Error is omitted from the JSON output when empty.
Error string `json:"error,omitempty"`
// Checks is keyed by a string identifier per check.
// NOTE(review): the key scheme is defined outside this chunk.
Checks map[string]startupCheckRecord `json:"checks"`
AutoHeals []string `json:"auto_heals"`
SourceHost string `json:"source_host"`
LastUpdated time.Time `json:"last_updated"`
}
// startupCheckRecord is the JSON-serializable state of one startup check: a
// status string, a free-form detail string and the time it was last updated.
type startupCheckRecord struct {
Status string `json:"status"`
Detail string `json:"detail"`
UpdatedAt time.Time `json:"updated_at"`
}
// datastoreEndpointPattern matches a `--datastore-endpoint` flag followed by
// either `=` or whitespace and then a value. The value is captured in group 1
// when single-quoted, group 2 when double-quoted, and group 3 when bare (a run
// of characters containing no whitespace and no backslash).
var datastoreEndpointPattern = regexp.MustCompile(`--datastore-endpoint(?:=|\s+)(?:'([^']+)'|"([^"]+)"|([^\s\\]+))`)
// criticalStartupWorkloads is the fixed list of deployments and statefulsets
// (flux controllers, vault, postgres, gitea, monitoring and logging
// components) treated as critical during startup.
// NOTE(review): the code that consumes this list is outside this chunk.
var criticalStartupWorkloads = []startupWorkload{
{Namespace: "flux-system", Kind: "deployment", Name: "source-controller"},
{Namespace: "flux-system", Kind: "deployment", Name: "kustomize-controller"},
{Namespace: "flux-system", Kind: "deployment", Name: "helm-controller"},
{Namespace: "flux-system", Kind: "deployment", Name: "notification-controller"},
{Namespace: "vault", Kind: "statefulset", Name: "vault"},
{Namespace: "postgres", Kind: "statefulset", Name: "postgres"},
{Namespace: "gitea", Kind: "deployment", Name: "gitea"},
{Namespace: "monitoring", Kind: "deployment", Name: "grafana"},
{Namespace: "monitoring", Kind: "statefulset", Name: "victoria-metrics-single-server"},
{Namespace: "monitoring", Kind: "deployment", Name: "kube-state-metrics"},
{Namespace: "logging", Kind: "deployment", Name: "oauth2-proxy-logs"},
{Namespace: "logging", Kind: "deployment", Name: "opensearch-dashboards"},
{Namespace: "logging", Kind: "statefulset", Name: "opensearch"},
}
// ErrEtcdRestoreNotApplicable is a sentinel error with the message
// "etcd restore not applicable"; compare against it with errors.Is.
// NOTE(review): the conditions under which it is returned are outside this chunk.
var ErrEtcdRestoreNotApplicable = errors.New("etcd restore not applicable")
// New builds an Orchestrator from its four dependencies.
//
// The configuration, runner, store and logger are stored as given; the command
// overrides and the active startup report are left at their zero values.
func New(cfg config.Config, runner *execx.Runner, store *state.Store, logger *log.Logger) *Orchestrator {
	o := new(Orchestrator)
	o.cfg = cfg
	o.runner = runner
	o.store = store
	o.log = logger
	return o
}
// SetCommandOverrides installs the two command override hooks on the
// orchestrator, replacing whatever was set before. Passing nil for either
// argument clears that hook.
//
// Why: enables deterministic integration testing from the top-level testing
// module without requiring package-local test files or live cluster
// dependencies.
func (o *Orchestrator) SetCommandOverrides(run commandOverrideFunc, runSensitive commandOverrideFunc) {
	o.runOverride, o.runSensitiveOverride = run, runSensitive
}

View File

@ -0,0 +1,389 @@
package cluster
import (
"context"
"crypto/tls"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
"unicode"
"scm.bstein.dev/bstein/ananke/internal/config"
)
// isLikelyHostname reports whether value looks like a dotted hostname.
//
// Surrounding whitespace is ignored. The trimmed value must be non-empty, must
// contain no space and no "/" (which rules out URLs and paths), and must
// contain at least one ".".
func isLikelyHostname(value string) bool {
	candidate := strings.TrimSpace(value)
	switch {
	case candidate == "":
		return false
	case strings.ContainsAny(candidate, " /"):
		return false
	default:
		return strings.Contains(candidate, ".")
	}
}
// healIngressHostBackendReplicas scales up zero-replica workloads in the
// namespaces that serve the given ingress host.
//
// It looks up the namespaces for host, lists every deployment and statefulset
// in the cluster, and for each one in a matching namespace whose spec asks for
// fewer than one replica, requests one replica. Workloads with no explicit
// replica count are treated as already having one. Workloads that disappear
// mid-scan (not-found errors) are skipped.
//
// It returns the "namespace/kind/name" identifiers of the workloads it scaled.
// On a scaling failure it returns the identifiers healed so far together with
// the error. When no namespace matches the host it returns (nil, nil).
func (o *Orchestrator) healIngressHostBackendReplicas(ctx context.Context, host string) ([]string, error) {
	namespaces, err := o.discoverIngressNamespacesForHost(ctx, host)
	switch {
	case err != nil:
		return nil, err
	case len(namespaces) == 0:
		return nil, nil
	}
	inScope := makeStringSet(namespaces)

	raw, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query workloads: %w", err)
	}
	var workloads workloadList
	if err := json.Unmarshal([]byte(raw), &workloads); err != nil {
		return nil, fmt.Errorf("decode workloads: %w", err)
	}

	healed := []string{}
	for _, w := range workloads.Items {
		ns := strings.TrimSpace(w.Metadata.Namespace)
		name := strings.TrimSpace(w.Metadata.Name)
		kind := strings.ToLower(strings.TrimSpace(w.Kind))
		if ns == "" || name == "" {
			continue
		}
		// An empty kind fails this test as well, so it needs no separate check.
		if kind != "deployment" && kind != "statefulset" {
			continue
		}
		if _, wanted := inScope[ns]; !wanted {
			continue
		}
		// A missing replica count counts as one replica: nothing to heal.
		if w.Spec.Replicas == nil || *w.Spec.Replicas >= 1 {
			continue
		}
		target := startupWorkload{Namespace: ns, Kind: kind, Name: name}
		scaleErr := o.ensureWorkloadReplicas(ctx, target, 1)
		if scaleErr == nil {
			healed = append(healed, ns+"/"+kind+"/"+name)
			continue
		}
		if isNotFoundErr(scaleErr) {
			continue
		}
		return healed, fmt.Errorf("scale %s/%s/%s to 1: %w", ns, kind, name, scaleErr)
	}
	return healed, nil
}
// waitForServiceChecklist polls the configured service checklist until it
// passes, the wait budget is exhausted, or ctx is cancelled.
//
// The wait budget comes from Startup.ServiceChecklistWaitSeconds (7 minutes
// when unset or non-positive) and the poll interval from
// Startup.ServiceChecklistPollSeconds (5 seconds when unset or non-positive).
// Each iteration first invokes the auto-recycle and replica auto-heal hooks,
// then evaluates the checklist; on failure it invokes the ingress auto-heal
// hook with the failure text.
//
// Returns nil once the checklist passes, ctx.Err() on cancellation, or a
// "startup blocked" error carrying the last failure detail on timeout.
func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error {
wait := time.Duration(o.cfg.Startup.ServiceChecklistWaitSeconds) * time.Second
if wait <= 0 {
wait = 7 * time.Minute
}
poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second
if poll <= 0 {
poll = 5 * time.Second
}
deadline := time.Now().Add(wait)
lastFailure := "unknown"
// Zero timestamps: the first iteration always logs, and the heal hooks receive
// a zero "last attempt" time.
// NOTE(review): the hooks presumably rate-limit themselves via these pointers — confirm.
lastLogged := time.Time{}
lastRecycleAttempt := time.Time{}
lastReplicaHeal := time.Time{}
lastIngressHeal := time.Time{}
for {
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
prevFailure := lastFailure
ready, detail := o.serviceChecklistReady(ctx)
lastFailure = detail
if ready {
o.log.Printf("external service checklist passed (%s)", detail)
return nil
}
o.maybeAutoHealIngressHostBackends(ctx, &lastIngressHeal, lastFailure)
// Log when the failure text changes, otherwise at most once every 30 seconds.
if lastFailure != prevFailure || time.Since(lastLogged) >= 30*time.Second {
remaining := time.Until(deadline).Round(time.Second)
if remaining < 0 {
remaining = 0
}
o.log.Printf("waiting for external service checklist (%s remaining): %s", remaining, lastFailure)
lastLogged = time.Now()
}
// The deadline is tested after the evaluation, so the checklist is always
// evaluated at least once even with a very small wait budget.
if time.Now().After(deadline) {
return fmt.Errorf("startup blocked: external service checklist not satisfied within %s (%s)", wait, lastFailure)
}
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(poll):
}
}
}
// serviceChecklistReady evaluates every configured service check in order and
// stops at the first failure.
//
// It returns (true, "no checklist items configured") when the checklist is
// empty, (true, "checks=N") when all N checks pass, and otherwise
// (false, "<label>: <detail>") where the label is the check's trimmed name or,
// if that is empty, its trimmed URL.
func (o *Orchestrator) serviceChecklistReady(ctx context.Context) (bool, string) {
	items := o.cfg.Startup.ServiceChecklist
	total := len(items)
	if total == 0 {
		return true, "no checklist items configured"
	}
	for _, item := range items {
		passed, reason := o.serviceCheckReady(ctx, item)
		if passed {
			continue
		}
		label := strings.TrimSpace(item.Name)
		if label == "" {
			label = strings.TrimSpace(item.URL)
		}
		return false, fmt.Sprintf("%s: %s", label, reason)
	}
	return true, fmt.Sprintf("checks=%d", total)
}
// serviceCheckReady probes one checklist entry over HTTP and validates the
// response against the entry's expectations.
//
// Validation order: the status code must be in check.AcceptedStatuses (or in
// the built-in default list when none are configured), then the Location
// header must contain / must not contain the configured markers, then the body
// must contain / must not contain the configured markers. Markers are trimmed
// and empty markers are ignored; matching is delegated to checklistContains.
//
// It returns (true, "status=<code>") on success, or false with a description of
// the first expectation that failed (or the probe error text).
func (o *Orchestrator) serviceCheckReady(ctx context.Context, check config.ServiceChecklistCheck) (bool, string) {
	probe, err := o.httpChecklistProbeResult(ctx, check)
	if err != nil {
		return false, err.Error()
	}

	allowed := check.AcceptedStatuses
	if len(allowed) == 0 {
		allowed = []int{200, 201, 202, 203, 204, 301, 302, 303, 307, 308, 401, 403}
	}
	matched := false
	for i := 0; i < len(allowed) && !matched; i++ {
		matched = allowed[i] == probe.Status
	}
	if !matched {
		return false, fmt.Sprintf("unexpected status code=%d", probe.Status)
	}

	if want := strings.TrimSpace(check.LocationContains); want != "" && !checklistContains(probe.Location, want) {
		return false, fmt.Sprintf("location header missing expected marker %q", want)
	}
	if forbidden := strings.TrimSpace(check.LocationNotContains); forbidden != "" && checklistContains(probe.Location, forbidden) {
		return false, fmt.Sprintf("location header contained forbidden marker %q", forbidden)
	}
	if want := strings.TrimSpace(check.BodyContains); want != "" && !checklistContains(probe.Body, want) {
		return false, fmt.Sprintf("response missing expected marker %q", want)
	}
	if forbidden := strings.TrimSpace(check.BodyNotContains); forbidden != "" && checklistContains(probe.Body, forbidden) {
		return false, fmt.Sprintf("response contained forbidden marker %q", forbidden)
	}
	return true, fmt.Sprintf("status=%d", probe.Status)
}
// checklistHTTPProbeResult carries what a checklist HTTP probe observed: the
// response status code, the response body as a string, and the value of the
// Location response header.
type checklistHTTPProbeResult struct {
Status int
Body string
Location string
}
// httpChecklistProbeResult performs the HTTP probe for check and packages the
// status, body and Location header into a checklistHTTPProbeResult. On a probe
// error it returns the zero result together with that error.
//
// Why: checklist checks need response headers (for redirect verification) in
// addition to status/body so startup can validate real user-facing behavior.
func (o *Orchestrator) httpChecklistProbeResult(ctx context.Context, check config.ServiceChecklistCheck) (checklistHTTPProbeResult, error) {
	code, text, redirect, err := o.httpChecklistProbeWithLocation(ctx, check)
	if err != nil {
		return checklistHTTPProbeResult{}, err
	}
	return checklistHTTPProbeResult{
		Status:   code,
		Body:     text,
		Location: redirect,
	}, nil
}
// httpChecklistProbe performs the HTTP probe for check and returns only the
// status code and body, discarding the Location header reported by
// httpChecklistProbeWithLocation.
func (o *Orchestrator) httpChecklistProbe(ctx context.Context, check config.ServiceChecklistCheck) (int, string, error) {
	code, text, _, probeErr := o.httpChecklistProbeWithLocation(ctx, check)
	return code, text, probeErr
}
// httpChecklistProbeWithLocation issues a single GET to check.URL and returns
// the response status code, up to 64 KiB of the body, and the trimmed Location
// header.
//
// Redirects are not followed: the first response is returned as-is so callers
// can assert on its Location header. The request timeout is
// check.TimeoutSeconds (12 seconds when unset or non-positive), and TLS
// verification is disabled only when check.InsecureSkipTLS is set.
//
// Errors are wrapped with "build request", "request failed" or
// "read response body". A body read error still reports the status code.
//
// Why: redirects and auth gates require location-header assertions to prevent
// startup false-positives on partially healthy protected services.
func (o *Orchestrator) httpChecklistProbeWithLocation(ctx context.Context, check config.ServiceChecklistCheck) (int, string, string, error) {
	timeout := time.Duration(check.TimeoutSeconds) * time.Second
	if timeout <= 0 {
		timeout = 12 * time.Second
	}
	transport := &http.Transport{}
	// A new transport is created for every probe and a zero-value Transport has
	// no idle timeout, so its keep-alive connections (and their goroutines)
	// would otherwise accumulate across polls. Release them when the probe ends.
	defer transport.CloseIdleConnections()
	if check.InsecureSkipTLS {
		// Opt-in per check via configuration.
		transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
	}
	client := &http.Client{
		Timeout:   timeout,
		Transport: transport,
		// Surface the redirect response itself instead of following it.
		CheckRedirect: func(_ *http.Request, _ []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(check.URL), nil)
	if err != nil {
		return 0, "", "", fmt.Errorf("build request: %w", err)
	}
	req.Header.Set("User-Agent", "ananke/startup-checklist")
	resp, err := client.Do(req)
	if err != nil {
		return 0, "", "", fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()
	// Cap the body so a misbehaving endpoint cannot exhaust memory.
	body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
	if readErr != nil {
		return resp.StatusCode, "", "", fmt.Errorf("read response body: %w", readErr)
	}
	return resp.StatusCode, string(body), strings.TrimSpace(resp.Header.Get("Location")), nil
}
// checklistContains reports whether marker occurs in body, ignoring case and,
// as a fallback, ignoring all whitespace in both strings.
//
// A marker that is empty once whitespace is removed always matches.
func checklistContains(body, marker string) bool {
	haystack, needle := strings.ToLower(body), strings.ToLower(marker)
	if strings.Contains(haystack, needle) {
		return true
	}
	// Retry with whitespace stripped so formatting differences do not matter.
	needle = compactLowerNoSpace(needle)
	if needle == "" {
		return true
	}
	return strings.Contains(compactLowerNoSpace(haystack), needle)
}
// compactLowerNoSpace returns s with every Unicode whitespace rune removed.
//
// Despite its name it does not change case; callers are expected to pass an
// already lower-cased string.
func compactLowerNoSpace(s string) string {
	dropSpace := func(r rune) rune {
		if unicode.IsSpace(r) {
			return -1 // a negative result tells strings.Map to drop the rune
		}
		return r
	}
	return strings.Map(dropSpace, s)
}
// waitForStabilityWindow requires startup health to hold continuously for a
// configured soak period.
//
// The window is Startup.ServiceChecklistStabilitySec seconds; a non-positive
// value disables the soak and returns nil immediately. The poll interval is
// Startup.ServiceChecklistPollSeconds (5 seconds when unset or non-positive).
// Each iteration invokes the auto-recycle and replica auto-heal hooks and then
// startupStabilityHealthy; the first unhealthy result fails the window at once
// with no retry.
//
// Returns nil once a healthy evaluation occurs after the deadline, ctx.Err()
// on cancellation, or a "startup stability window failed" error.
func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
window := time.Duration(o.cfg.Startup.ServiceChecklistStabilitySec) * time.Second
if window <= 0 {
return nil
}
poll := time.Duration(o.cfg.Startup.ServiceChecklistPollSeconds) * time.Second
if poll <= 0 {
poll = 5 * time.Second
}
deadline := time.Now().Add(window)
// Zero timestamp: the first progress line is logged on the first iteration.
lastStatus := time.Time{}
lastRecycleAttempt := time.Time{}
lastReplicaHeal := time.Time{}
for {
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
if err := o.startupStabilityHealthy(ctx); err != nil {
return fmt.Errorf("startup stability window failed: %w", err)
}
// Health is checked before the deadline test, so the window only passes on
// an iteration that was itself healthy.
if time.Now().After(deadline) {
o.log.Printf("startup stability window passed (%s)", window)
return nil
}
// Progress is logged at most once every 30 seconds.
if time.Since(lastStatus) >= 30*time.Second {
remaining := time.Until(deadline).Round(time.Second)
if remaining < 0 {
remaining = 0
}
o.log.Printf("startup stability soak in progress (%s remaining)", remaining)
lastStatus = time.Now()
}
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(poll):
}
}
}
// startupStabilityHealthy runs every enabled startup health gate once and
// returns the first problem found, or nil when all of them pass.
//
// Gates, in order and each controlled by its own Startup.Require* flag: flux
// health, workload convergence, the external service checklist and the ingress
// checklist. The pod crash/image-pull failure scan always runs last, whatever
// the flags say.
func (o *Orchestrator) startupStabilityHealthy(ctx context.Context) error {
	startup := o.cfg.Startup

	if startup.RequireFluxHealth {
		switch ready, detail, err := o.fluxHealthReady(ctx); {
		case err != nil:
			return fmt.Errorf("flux check error: %w", err)
		case !ready:
			return fmt.Errorf("flux not ready: %s", detail)
		}
	}
	if startup.RequireWorkloadConvergence {
		switch ready, detail, err := o.workloadConvergenceReady(ctx); {
		case err != nil:
			return fmt.Errorf("workload check error: %w", err)
		case !ready:
			return fmt.Errorf("workloads not converged: %s", detail)
		}
	}
	if startup.RequireServiceChecklist {
		if ready, detail := o.serviceChecklistReady(ctx); !ready {
			return fmt.Errorf("external services not healthy: %s", detail)
		}
	}
	if startup.RequireIngressChecklist {
		if ready, detail := o.ingressChecklistReady(ctx); !ready {
			return fmt.Errorf("ingress reachability not healthy: %s", detail)
		}
	}

	failures, err := o.startupFailurePods(ctx)
	switch {
	case err != nil:
		return fmt.Errorf("pod failure check error: %w", err)
	case len(failures) > 0:
		return fmt.Errorf("pods in crash/image-pull failures: %s", joinLimited(failures, 8))
	}
	return nil
}

View File

@ -15,6 +15,9 @@ import (
"scm.bstein.dev/bstein/ananke/internal/state" "scm.bstein.dev/bstein/ananke/internal/state"
) )
// TestParseVaultSealed runs one orchestration or CLI step.
// Signature: TestParseVaultSealed(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestParseVaultSealed(t *testing.T) { func TestParseVaultSealed(t *testing.T) {
sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`) sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`)
if err != nil { if err != nil {
@ -33,12 +36,18 @@ func TestParseVaultSealed(t *testing.T) {
} }
} }
// TestParseVaultSealedRejectsEmpty runs one orchestration or CLI step.
// Signature: TestParseVaultSealedRejectsEmpty(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestParseVaultSealedRejectsEmpty(t *testing.T) { func TestParseVaultSealedRejectsEmpty(t *testing.T) {
if _, err := parseVaultSealed(" "); err == nil { if _, err := parseVaultSealed(" "); err == nil {
t.Fatalf("expected parse error for empty status payload") t.Fatalf("expected parse error for empty status payload")
} }
} }
// TestParseVaultSealedWithKubectlPreamble runs one orchestration or CLI step.
// Signature: TestParseVaultSealedWithKubectlPreamble(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestParseVaultSealedWithKubectlPreamble(t *testing.T) { func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
raw := "Defaulted container \"vault\" out of: vault, setup-config (init)\n{\"sealed\":true,\"initialized\":true}\n" raw := "Defaulted container \"vault\" out of: vault, setup-config (init)\n{\"sealed\":true,\"initialized\":true}\n"
sealed, err := parseVaultSealed(raw) sealed, err := parseVaultSealed(raw)
@ -50,6 +59,9 @@ func TestParseVaultSealedWithKubectlPreamble(t *testing.T) {
} }
} }
// TestFallbackWorkersFromInventoryUsesManagedNodes runs one orchestration or CLI step.
// Signature: TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) { func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
orch := &Orchestrator{ orch := &Orchestrator{
cfg: config.Config{ cfg: config.Config{
@ -70,6 +82,9 @@ func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) {
} }
} }
// TestFallbackWorkersFromInventoryFallsBackToHosts runs one orchestration or CLI step.
// Signature: TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) { func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
orch := &Orchestrator{ orch := &Orchestrator{
cfg: config.Config{ cfg: config.Config{
@ -89,12 +104,18 @@ func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) {
} }
} }
// TestIntentFreshTreatsZeroTimestampAsFresh runs one orchestration or CLI step.
// Signature: TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T) { func TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T) {
if !intentFresh(state.Intent{}, 30*time.Second) { if !intentFresh(state.Intent{}, 30*time.Second) {
t.Fatalf("zero updated_at intent should be treated as fresh") t.Fatalf("zero updated_at intent should be treated as fresh")
} }
} }
// TestIntentFreshRespectsAge runs one orchestration or CLI step.
// Signature: TestIntentFreshRespectsAge(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestIntentFreshRespectsAge(t *testing.T) { func TestIntentFreshRespectsAge(t *testing.T) {
stale := state.Intent{UpdatedAt: time.Now().Add(-2 * time.Minute)} stale := state.Intent{UpdatedAt: time.Now().Add(-2 * time.Minute)}
fresh := state.Intent{UpdatedAt: time.Now().Add(-20 * time.Second)} fresh := state.Intent{UpdatedAt: time.Now().Add(-20 * time.Second)}
@ -106,6 +127,9 @@ func TestIntentFreshRespectsAge(t *testing.T) {
} }
} }
// TestCoordinationPeersDedupesAndIncludesForwardHost runs one orchestration or CLI step.
// Signature: TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) { func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
orch := &Orchestrator{ orch := &Orchestrator{
cfg: config.Config{ cfg: config.Config{
@ -122,6 +146,9 @@ func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) {
} }
} }
// TestWorkloadTargetsIgnoredNodesByNodeSelector runs one orchestration or CLI step.
// Signature: TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) { func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
spec := podSpec{ spec := podSpec{
NodeSelector: map[string]string{ NodeSelector: map[string]string{
@ -134,6 +161,9 @@ func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) {
} }
} }
// TestParseWorkloadIgnoreRules runs one orchestration or CLI step.
// Signature: TestParseWorkloadIgnoreRules(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestParseWorkloadIgnoreRules(t *testing.T) { func TestParseWorkloadIgnoreRules(t *testing.T) {
rules := parseWorkloadIgnoreRules([]string{ rules := parseWorkloadIgnoreRules([]string{
"maintenance/metis", "maintenance/metis",
@ -153,6 +183,9 @@ func TestParseWorkloadIgnoreRules(t *testing.T) {
} }
} }
// TestNamespaceCandidatesFromIgnoreKustomizations runs one orchestration or CLI step.
// Signature: TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) { func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
got := namespaceCandidatesFromIgnoreKustomizations([]string{ got := namespaceCandidatesFromIgnoreKustomizations([]string{
"flux-system/jellyfin", "flux-system/jellyfin",
@ -166,12 +199,18 @@ func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) {
} }
} }
// TestProbeStatusAcceptedRejects404 runs one orchestration or CLI step.
// Signature: TestProbeStatusAcceptedRejects404(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestProbeStatusAcceptedRejects404(t *testing.T) { func TestProbeStatusAcceptedRejects404(t *testing.T) {
if probeStatusAccepted("https://metrics.bstein.dev/login", 404) { if probeStatusAccepted("https://metrics.bstein.dev/login", 404) {
t.Fatalf("expected 404 probe status to be rejected") t.Fatalf("expected 404 probe status to be rejected")
} }
} }
// TestParseFluxKustomizationTimeout runs one orchestration or CLI step.
// Signature: TestParseFluxKustomizationTimeout(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestParseFluxKustomizationTimeout(t *testing.T) { func TestParseFluxKustomizationTimeout(t *testing.T) {
if got := parseFluxKustomizationTimeout("30m"); got != 30*time.Minute { if got := parseFluxKustomizationTimeout("30m"); got != 30*time.Minute {
t.Fatalf("expected 30m duration, got %s", got) t.Fatalf("expected 30m duration, got %s", got)
@ -187,6 +226,9 @@ func TestParseFluxKustomizationTimeout(t *testing.T) {
} }
} }
// TestServiceCheckReadyRequiresBodyContains runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRequiresBodyContains(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestServiceCheckReadyRequiresBodyContains(t *testing.T) { func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK) w.WriteHeader(http.StatusOK)
@ -209,6 +251,9 @@ func TestServiceCheckReadyRequiresBodyContains(t *testing.T) {
} }
} }
// TestServiceCheckReadyBodyContainsIgnoresWhitespace runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) { func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK) w.WriteHeader(http.StatusOK)
@ -231,6 +276,62 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
} }
} }
// TestServiceCheckReadyRequiresLocationContains runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRequiresLocationContains(t *testing.T).
// Why: startup checks must validate redirect targets for OIDC-gated services.
func TestServiceCheckReadyRequiresLocationContains(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=logs")
w.WriteHeader(http.StatusFound)
}))
defer srv.Close()
orch := &Orchestrator{
log: log.New(os.Stdout, "", 0),
}
ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
Name: "logging-oidc-redirect",
URL: srv.URL,
AcceptedStatuses: []int{302},
LocationContains: "client_id=logs",
TimeoutSeconds: 5,
})
if !ok {
t.Fatalf("expected location-aware service check to pass, detail=%s", detail)
}
}
// TestServiceCheckReadyRejectsMissingLocationMarker runs one orchestration or CLI step.
// Signature: TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T).
// Why: prevents false positives when redirects point somewhere unexpected.
func TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=wrong")
w.WriteHeader(http.StatusFound)
}))
defer srv.Close()
orch := &Orchestrator{
log: log.New(os.Stdout, "", 0),
}
ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{
Name: "logging-oidc-redirect",
URL: srv.URL,
AcceptedStatuses: []int{302},
LocationContains: "client_id=logs",
TimeoutSeconds: 5,
})
if ok {
t.Fatalf("expected location-aware service check to fail")
}
if !strings.Contains(detail, "location header missing expected marker") {
t.Fatalf("expected missing location marker detail, got %q", detail)
}
}
// TestChecklistFailureHostFromIngressDetail runs one orchestration or CLI step.
// Signature: TestChecklistFailureHostFromIngressDetail(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestChecklistFailureHostFromIngressDetail(t *testing.T) { func TestChecklistFailureHostFromIngressDetail(t *testing.T) {
orch := &Orchestrator{} orch := &Orchestrator{}
got := orch.checklistFailureHost("cloud.bstein.dev: unexpected status code=500") got := orch.checklistFailureHost("cloud.bstein.dev: unexpected status code=500")
@ -239,6 +340,9 @@ func TestChecklistFailureHostFromIngressDetail(t *testing.T) {
} }
} }
// TestChecklistFailureHostFromServiceCheckName runs one orchestration or CLI step.
// Signature: TestChecklistFailureHostFromServiceCheckName(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestChecklistFailureHostFromServiceCheckName(t *testing.T) { func TestChecklistFailureHostFromServiceCheckName(t *testing.T) {
orch := &Orchestrator{ orch := &Orchestrator{
cfg: config.Config{ cfg: config.Config{
@ -258,6 +362,9 @@ func TestChecklistFailureHostFromServiceCheckName(t *testing.T) {
} }
} }
// TestChecklistFailureHostUnknown runs one orchestration or CLI step.
// Signature: TestChecklistFailureHostUnknown(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestChecklistFailureHostUnknown(t *testing.T) { func TestChecklistFailureHostUnknown(t *testing.T) {
orch := &Orchestrator{ orch := &Orchestrator{
cfg: config.Config{ cfg: config.Config{
@ -279,6 +386,9 @@ func TestChecklistFailureHostUnknown(t *testing.T) {
} }
} }
// TestStuckVaultInitReasonDetectsHungInit runs one orchestration or CLI step.
// Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) { func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
var pod podResource var pod podResource
pod.Status.Phase = "Pending" pod.Status.Phase = "Pending"
@ -302,6 +412,9 @@ func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
} }
} }
// TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods runs one orchestration or CLI step.
// Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) { func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
var pod podResource var pod podResource
pod.Status.Phase = "Pending" pod.Status.Phase = "Pending"
@ -328,70 +441,3 @@ func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
t.Fatalf("expected no reason for non-vault pod, got %q", reason) t.Fatalf("expected no reason for non-vault pod, got %q", reason)
} }
} }
func TestValidateNodeInventoryPassesForStrictMappings(t *testing.T) {
orch := &Orchestrator{
cfg: config.Config{
SSHUser: "atlas",
SSHPort: 2277,
SSHNodeHosts: map[string]string{
"titan-0a": "192.168.22.11",
"titan-0b": "192.168.22.12",
"titan-0c": "192.168.22.13",
"titan-22": "192.168.22.22",
},
SSHManagedNodes: []string{"titan-0a", "titan-0b", "titan-0c", "titan-22"},
ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"},
Workers: []string{"titan-22"},
},
log: log.New(os.Stdout, "", 0),
}
if err := orch.validateNodeInventory(); err != nil {
t.Fatalf("expected inventory to pass, got error: %v", err)
}
}
func TestValidateNodeInventoryFailsWhenNodeMappingMissing(t *testing.T) {
orch := &Orchestrator{
cfg: config.Config{
SSHUser: "atlas",
SSHPort: 2277,
SSHNodeHosts: map[string]string{"titan-0a": "192.168.22.11"},
SSHManagedNodes: []string{"titan-0a", "titan-0b"},
ControlPlanes: []string{"titan-0a"},
Workers: []string{"titan-0b"},
},
log: log.New(os.Stdout, "", 0),
}
err := orch.validateNodeInventory()
if err == nil {
t.Fatalf("expected inventory error for missing mapping")
}
if !strings.Contains(err.Error(), "missing ssh_node_hosts entry") {
t.Fatalf("expected missing-mapping detail, got: %v", err)
}
}
func TestValidateNodeInventoryFailsWhenWorkerNotManaged(t *testing.T) {
orch := &Orchestrator{
cfg: config.Config{
SSHUser: "atlas",
SSHPort: 2277,
SSHNodeHosts: map[string]string{
"titan-0a": "192.168.22.11",
"titan-22": "192.168.22.22",
},
SSHManagedNodes: []string{"titan-0a"},
ControlPlanes: []string{"titan-0a"},
Workers: []string{"titan-22"},
},
log: log.New(os.Stdout, "", 0),
}
err := orch.validateNodeInventory()
if err == nil {
t.Fatalf("expected inventory error for unmanaged worker")
}
if !strings.Contains(err.Error(), "missing from ssh_managed_nodes") {
t.Fatalf("expected unmanaged-worker detail, got: %v", err)
}
}

View File

@ -0,0 +1,236 @@
package config
import "strings"
// applyDefaults fills every unset field of c with its built-in default, in place.
// Signature: (c *Config) applyDefaults().
//
// Rules visible below:
//   - numeric settings are replaced when they are <= 0;
//   - string settings are replaced when empty (VaultUnsealKeyFile also when
//     it is whitespace-only);
//   - list/map settings are replaced when nil, or when empty where len() is used;
//   - ServiceChecklist and CriticalServiceEndpoints are always merged with the
//     built-in baselines rather than only defaulted when empty.
//
// Boolean fields are not touched here.
func (c *Config) applyDefaults() {
// --- Flux / IaC repository expectations ---
if c.ExpectedFluxBranch == "" {
c.ExpectedFluxBranch = "main"
}
if c.IACRepoPath == "" {
c.IACRepoPath = "/opt/titan-iac"
}
if c.ExpectedFluxSource == "" {
c.ExpectedFluxSource = "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
}
// --- Startup: API wait, cooldown, battery and node inventory windows ---
if c.Startup.APIWaitSeconds <= 0 {
c.Startup.APIWaitSeconds = 1200
}
if c.Startup.APIPollSeconds <= 0 {
c.Startup.APIPollSeconds = 2
}
if c.Startup.ShutdownCooldownSeconds <= 0 {
c.Startup.ShutdownCooldownSeconds = 45
}
if c.Startup.MinimumBatteryPercent <= 0 {
c.Startup.MinimumBatteryPercent = 20
}
if c.Startup.NodeInventoryReachWaitSeconds <= 0 {
c.Startup.NodeInventoryReachWaitSeconds = 300
}
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
c.Startup.NodeInventoryReachPollSeconds = 5
}
// Only a nil map is defaulted; an explicitly empty map is left as-is.
if c.Startup.RequiredNodeLabels == nil {
c.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
}
// --- Startup: time sync ---
if c.Startup.TimeSyncWaitSeconds <= 0 {
c.Startup.TimeSyncWaitSeconds = 240
}
if c.Startup.TimeSyncPollSeconds <= 0 {
c.Startup.TimeSyncPollSeconds = 5
}
if c.Startup.TimeSyncMode == "" {
c.Startup.TimeSyncMode = "quorum"
}
if c.Startup.TimeSyncQuorum <= 0 {
c.Startup.TimeSyncQuorum = 2
}
// Clamp the quorum to the number of control planes (runs after the default
// above, so the default of 2 is clamped too). Skipped when none are configured.
if c.Startup.TimeSyncQuorum > len(c.ControlPlanes) && len(c.ControlPlanes) > 0 {
c.Startup.TimeSyncQuorum = len(c.ControlPlanes)
}
// The etcd restore target falls back to the first configured control plane.
if c.Startup.EtcdRestoreControlPlane == "" && len(c.ControlPlanes) > 0 {
c.Startup.EtcdRestoreControlPlane = c.ControlPlanes[0]
}
// --- Startup: storage readiness ---
if c.Startup.StorageReadyWaitSeconds <= 0 {
c.Startup.StorageReadyWaitSeconds = 420
}
if c.Startup.StorageReadyPollSeconds <= 0 {
c.Startup.StorageReadyPollSeconds = 5
}
if c.Startup.StorageMinReadyNodes <= 0 {
c.Startup.StorageMinReadyNodes = 2
}
// Entries are written as "namespace/name".
if len(c.Startup.StorageCriticalPVCs) == 0 {
c.Startup.StorageCriticalPVCs = []string{
"vault/data-vault-0",
"postgres/postgres-data-postgres-0",
"gitea/gitea-data",
"sso/keycloak-data",
}
}
// --- Startup: post-start probes ---
if c.Startup.PostStartProbeWaitSeconds <= 0 {
c.Startup.PostStartProbeWaitSeconds = 240
}
if c.Startup.PostStartProbePollSeconds <= 0 {
c.Startup.PostStartProbePollSeconds = 5
}
if len(c.Startup.PostStartProbes) == 0 {
c.Startup.PostStartProbes = []string{
"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
"https://scm.bstein.dev/api/healthz",
"https://metrics.bstein.dev/api/health",
}
}
// --- Startup: service checklist ---
if c.Startup.ServiceChecklistWaitSeconds <= 0 {
c.Startup.ServiceChecklistWaitSeconds = 420
}
if c.Startup.ServiceChecklistPollSeconds <= 0 {
c.Startup.ServiceChecklistPollSeconds = 5
}
// Unlike the other numeric settings, zero is kept here; only negative
// values are clamped to zero.
if c.Startup.ServiceChecklistStabilitySec < 0 {
c.Startup.ServiceChecklistStabilitySec = 0
}
// Always merge: configured checks are kept and built-in checks whose name is
// not already present are appended.
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
// Runs after the merge so every entry, configured or built-in, ends up with
// a positive per-check timeout.
for i := range c.Startup.ServiceChecklist {
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
}
}
// --- Startup: critical service endpoints ---
if c.Startup.CriticalServiceEndpointWaitSec <= 0 {
c.Startup.CriticalServiceEndpointWaitSec = 420
}
if c.Startup.CriticalServiceEndpointPollSec <= 0 {
c.Startup.CriticalServiceEndpointPollSec = 5
}
// Always merge with the built-in endpoint list (same policy as the checklist).
c.Startup.CriticalServiceEndpoints = mergeStringDefaults(c.Startup.CriticalServiceEndpoints, defaultCriticalServiceEndpoints())
// --- Startup: ingress checklist ---
if c.Startup.IngressChecklistWaitSeconds <= 0 {
c.Startup.IngressChecklistWaitSeconds = 420
}
if c.Startup.IngressChecklistPollSeconds <= 0 {
c.Startup.IngressChecklistPollSeconds = 5
}
if len(c.Startup.IngressChecklistAccepted) == 0 {
c.Startup.IngressChecklistAccepted = []int{200, 301, 302, 307, 308, 401, 403, 404}
}
// The nil-to-empty-slice normalisations below leave non-nil values untouched.
if c.Startup.IngressChecklistIgnoreHosts == nil {
c.Startup.IngressChecklistIgnoreHosts = []string{}
}
// --- Startup: node SSH auth, Flux health, workload convergence ---
if c.Startup.NodeSSHAuthWaitSeconds <= 0 {
c.Startup.NodeSSHAuthWaitSeconds = 240
}
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
c.Startup.NodeSSHAuthPollSeconds = 5
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
c.Startup.FluxHealthWaitSeconds = 900
}
if c.Startup.FluxHealthPollSeconds <= 0 {
c.Startup.FluxHealthPollSeconds = 5
}
if c.Startup.IgnoreFluxKustomizations == nil {
c.Startup.IgnoreFluxKustomizations = []string{}
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
c.Startup.WorkloadConvergenceWaitSeconds = 900
}
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
c.Startup.WorkloadConvergencePollSeconds = 5
}
if c.Startup.IgnoreWorkloadNamespaces == nil {
c.Startup.IgnoreWorkloadNamespaces = []string{}
}
if c.Startup.IgnoreWorkloads == nil {
c.Startup.IgnoreWorkloads = []string{}
}
if c.Startup.IgnoreUnavailableNodes == nil {
c.Startup.IgnoreUnavailableNodes = []string{}
}
if c.Startup.StuckPodGraceSeconds <= 0 {
c.Startup.StuckPodGraceSeconds = 180
}
// --- Startup: Vault unseal ---
// A whitespace-only path counts as unset here.
if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
}
if c.Startup.VaultUnsealBreakglassTimeout <= 0 {
c.Startup.VaultUnsealBreakglassTimeout = 15
}
// --- SSH ---
if c.SSHPort <= 0 {
c.SSHPort = 2277
}
// --- Shutdown budgets and parallelism ---
if c.Shutdown.DefaultBudgetSeconds <= 0 {
c.Shutdown.DefaultBudgetSeconds = 1380
}
if c.Shutdown.HistoryMinSamples <= 0 {
c.Shutdown.HistoryMinSamples = 3
}
if c.Shutdown.EmergencyBudgetSec <= 0 {
c.Shutdown.EmergencyBudgetSec = 420
}
if c.Shutdown.EmergencyMinSamples <= 0 {
c.Shutdown.EmergencyMinSamples = 3
}
if c.Shutdown.DrainParallelism <= 0 {
c.Shutdown.DrainParallelism = 6
}
if c.Shutdown.ScaleParallelism <= 0 {
c.Shutdown.ScaleParallelism = 8
}
if c.Shutdown.SSHParallelism <= 0 {
c.Shutdown.SSHParallelism = 8
}
// --- UPS monitoring ---
if c.UPS.PollSeconds <= 0 {
c.UPS.PollSeconds = 5
}
if c.UPS.RuntimeSafetyFactor <= 0 {
c.UPS.RuntimeSafetyFactor = 1.25
}
if c.UPS.DebounceCount <= 0 {
c.UPS.DebounceCount = 3
}
if c.UPS.TelemetryTimeoutSeconds <= 0 {
c.UPS.TelemetryTimeoutSeconds = 90
}
// --- Peer coordination ---
if c.Coordination.ForwardShutdownConfig == "" {
c.Coordination.ForwardShutdownConfig = "/etc/ananke/ananke.yaml"
}
if c.Coordination.PeerHosts == nil {
c.Coordination.PeerHosts = []string{}
}
if c.Coordination.CommandTimeoutSeconds <= 0 {
c.Coordination.CommandTimeoutSeconds = 25
}
if c.Coordination.StartupGuardMaxAgeSec <= 0 {
c.Coordination.StartupGuardMaxAgeSec = 900
}
if c.Coordination.Role == "" {
c.Coordination.Role = "coordinator"
}
// --- Metrics endpoint ---
if c.Metrics.BindAddr == "" {
c.Metrics.BindAddr = "0.0.0.0:9560"
}
if c.Metrics.Path == "" {
c.Metrics.Path = "/metrics"
}
// --- State file locations ---
// NOTE(review): these are fixed absolute paths; they are not derived from a
// customised State.Dir.
if c.State.Dir == "" {
c.State.Dir = "/var/lib/ananke"
}
if c.State.ReportsDir == "" {
c.State.ReportsDir = "/var/lib/ananke/reports"
}
if c.State.RunHistoryPath == "" {
c.State.RunHistoryPath = "/var/lib/ananke/runs.json"
}
if c.State.LockPath == "" {
c.State.LockPath = "/var/lib/ananke/ananke.lock"
}
if c.State.IntentPath == "" {
c.State.IntentPath = "/var/lib/ananke/intent.json"
}
}

View File

@ -7,6 +7,9 @@ import (
"testing" "testing"
) )
// TestLoadAcceptsUPSTargets runs one orchestration or CLI step.
// Signature: TestLoadAcceptsUPSTargets(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestLoadAcceptsUPSTargets(t *testing.T) { func TestLoadAcceptsUPSTargets(t *testing.T) {
tmp := t.TempDir() tmp := t.TempDir()
cfgPath := filepath.Join(tmp, "ananke.yaml") cfgPath := filepath.Join(tmp, "ananke.yaml")
@ -39,6 +42,9 @@ state:
} }
} }
// TestValidateForwardShutdownRequiresConfigPath runs one orchestration or CLI step.
// Signature: TestValidateForwardShutdownRequiresConfigPath(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) { func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Coordination.ForwardShutdownHost = "titan-db" cfg.Coordination.ForwardShutdownHost = "titan-db"
@ -48,6 +54,9 @@ func TestValidateForwardShutdownRequiresConfigPath(t *testing.T) {
} }
} }
// TestValidateRejectsUnknownRole runs one orchestration or CLI step.
// Signature: TestValidateRejectsUnknownRole(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsUnknownRole(t *testing.T) { func TestValidateRejectsUnknownRole(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Coordination.Role = "unknown" cfg.Coordination.Role = "unknown"
@ -56,6 +65,9 @@ func TestValidateRejectsUnknownRole(t *testing.T) {
} }
} }
// TestValidateRejectsEmptyPeerHostEntry runs one orchestration or CLI step.
// Signature: TestValidateRejectsEmptyPeerHostEntry(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) { func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Coordination.PeerHosts = []string{"titan-24", " "} cfg.Coordination.PeerHosts = []string{"titan-24", " "}
@ -64,6 +76,9 @@ func TestValidateRejectsEmptyPeerHostEntry(t *testing.T) {
} }
} }
// TestValidateRejectsUnknownEtcdRestoreControlPlane runs one orchestration or CLI step.
// Signature: TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) { func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.EtcdRestoreControlPlane = "titan-missing" cfg.Startup.EtcdRestoreControlPlane = "titan-missing"
@ -72,6 +87,9 @@ func TestValidateRejectsUnknownEtcdRestoreControlPlane(t *testing.T) {
} }
} }
// TestLoadSetsCoordinationGuardDefaults runs one orchestration or CLI step.
// Signature: TestLoadSetsCoordinationGuardDefaults(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestLoadSetsCoordinationGuardDefaults(t *testing.T) { func TestLoadSetsCoordinationGuardDefaults(t *testing.T) {
tmp := t.TempDir() tmp := t.TempDir()
cfgPath := filepath.Join(tmp, "ananke.yaml") cfgPath := filepath.Join(tmp, "ananke.yaml")
@ -114,6 +132,9 @@ state:
} }
} }
// TestValidateRejectsInvalidStartupShutdownCooldown runs one orchestration or CLI step.
// Signature: TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) { func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.ShutdownCooldownSeconds = 0 cfg.Startup.ShutdownCooldownSeconds = 0
@ -122,6 +143,9 @@ func TestValidateRejectsInvalidStartupShutdownCooldown(t *testing.T) {
} }
} }
// TestValidateRejectsInvalidTimeSyncMode runs one orchestration or CLI step.
// Signature: TestValidateRejectsInvalidTimeSyncMode(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) { func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.TimeSyncMode = "invalid" cfg.Startup.TimeSyncMode = "invalid"
@ -130,6 +154,9 @@ func TestValidateRejectsInvalidTimeSyncMode(t *testing.T) {
} }
} }
// TestValidateRejectsBadStoragePVCFormat runs one orchestration or CLI step.
// Signature: TestValidateRejectsBadStoragePVCFormat(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsBadStoragePVCFormat(t *testing.T) { func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.StorageCriticalPVCs = []string{"vault-data-vault-0"} cfg.Startup.StorageCriticalPVCs = []string{"vault-data-vault-0"}
@ -138,6 +165,9 @@ func TestValidateRejectsBadStoragePVCFormat(t *testing.T) {
} }
} }
// TestValidateRejectsMissingPostStartProbesWhenRequired runs one orchestration or CLI step.
// Signature: TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) { func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.RequirePostStartProbes = true cfg.Startup.RequirePostStartProbes = true
@ -147,6 +177,9 @@ func TestValidateRejectsMissingPostStartProbesWhenRequired(t *testing.T) {
} }
} }
// TestValidateRejectsMissingServiceChecklistWhenRequired runs one orchestration or CLI step.
// Signature: TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) { func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.RequireServiceChecklist = true cfg.Startup.RequireServiceChecklist = true
@ -156,6 +189,9 @@ func TestValidateRejectsMissingServiceChecklistWhenRequired(t *testing.T) {
} }
} }
// TestValidateRejectsBadServiceChecklistURL runs one orchestration or CLI step.
// Signature: TestValidateRejectsBadServiceChecklistURL(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsBadServiceChecklistURL(t *testing.T) { func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{ cfg.Startup.ServiceChecklist = []ServiceChecklistCheck{
@ -171,6 +207,9 @@ func TestValidateRejectsBadServiceChecklistURL(t *testing.T) {
} }
} }
// TestValidateRejectsBadIgnoreFluxKustomizationFormat runs one orchestration or CLI step.
// Signature: TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) { func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"} cfg.Startup.IgnoreFluxKustomizations = []string{"jellyfin"}
@ -179,6 +218,9 @@ func TestValidateRejectsBadIgnoreFluxKustomizationFormat(t *testing.T) {
} }
} }
// TestValidateRejectsBadIgnoreWorkloadFormat runs one orchestration or CLI step.
// Signature: TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) { func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.IgnoreWorkloads = []string{"maintenance/metis/extra/value"} cfg.Startup.IgnoreWorkloads = []string{"maintenance/metis/extra/value"}
@ -187,6 +229,9 @@ func TestValidateRejectsBadIgnoreWorkloadFormat(t *testing.T) {
} }
} }
// TestValidateRejectsInvalidRequiredNodeLabel runs one orchestration or CLI step.
// Signature: TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T).
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) { func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
cfg := defaults() cfg := defaults()
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{ cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
@ -198,3 +243,85 @@ func TestValidateRejectsInvalidRequiredNodeLabel(t *testing.T) {
t.Fatalf("expected validation error for invalid required_node_labels entry") t.Fatalf("expected validation error for invalid required_node_labels entry")
} }
} }
// TestValidateRejectsInvalidNodeInventoryReachWindow zeroes
// Startup.NodeInventoryReachWaitSeconds on an otherwise default config and
// expects Validate to return an error.
func TestValidateRejectsInvalidNodeInventoryReachWindow(t *testing.T) {
	broken := defaults()
	broken.Startup.NodeInventoryReachWaitSeconds = 0

	err := broken.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for invalid node_inventory_reachability_wait_seconds")
}
// TestValidateRejectsMissingReportsDir blanks State.ReportsDir on an otherwise
// default config and expects Validate to return an error.
func TestValidateRejectsMissingReportsDir(t *testing.T) {
	broken := defaults()
	broken.State.ReportsDir = ""

	err := broken.Validate()
	if err != nil {
		return
	}
	t.Fatalf("expected validation error for missing state.reports_dir")
}
// TestApplyDefaultsMergesServiceChecklistDefaults runs one orchestration or CLI step.
// Signature: TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T).
// Why: host configs may define a partial checklist; startup still needs the
// baseline service validations learned from drills.
func TestApplyDefaultsMergesServiceChecklistDefaults(t *testing.T) {
cfg := Config{
Startup: Startup{
ServiceChecklist: []ServiceChecklistCheck{
{
Name: "custom-smoke",
URL: "https://example.invalid/healthz",
TimeoutSeconds: 7,
},
},
},
}
cfg.applyDefaults()
names := map[string]struct{}{}
for _, check := range cfg.Startup.ServiceChecklist {
names[check.Name] = struct{}{}
}
if _, ok := names["custom-smoke"]; !ok {
t.Fatalf("expected custom checklist entry to be preserved")
}
if _, ok := names["logging-oidc-redirect"]; !ok {
t.Fatalf("expected default logging redirect check to be merged in")
}
if _, ok := names["vaultwarden-ui"]; !ok {
t.Fatalf("expected default vaultwarden check to be merged in")
}
}
// TestApplyDefaultsMergesCriticalServiceEndpointDefaults starts from a config
// holding a single custom critical endpoint and verifies that applyDefaults
// keeps it while also adding built-in endpoints.
// Why: startup endpoint gating must keep baseline backend checks even when host
// configs only provide a subset.
func TestApplyDefaultsMergesCriticalServiceEndpointDefaults(t *testing.T) {
	var cfg Config
	cfg.Startup.CriticalServiceEndpoints = []string{"customns/customsvc"}
	cfg.applyDefaults()

	// hasEndpoint reports whether the merged endpoint list contains the entry.
	hasEndpoint := func(entry string) bool {
		for _, candidate := range cfg.Startup.CriticalServiceEndpoints {
			if candidate == entry {
				return true
			}
		}
		return false
	}

	if !hasEndpoint("customns/customsvc") {
		t.Fatalf("expected custom critical endpoint to be preserved")
	}
	if !hasEndpoint("logging/opensearch-dashboards") {
		t.Fatalf("expected logging/opensearch-dashboards critical endpoint default")
	}
	if !hasEndpoint("monitoring/victoria-metrics-single-server") {
		t.Fatalf("expected monitoring/victoria-metrics-single-server critical endpoint default")
	}
}

155
internal/config/defaults.go Normal file
View File

@ -0,0 +1,155 @@
package config
// defaults returns the built-in baseline Config.
// Signature: defaults() Config.
// Each section (startup, shutdown, UPS, coordination, metrics, state) is built
// as its own value, assembled into the top-level Config, and then passed
// through applyDefaults before being returned.
func defaults() Config {
	startupDefaults := Startup{
		APIWaitSeconds:                1200,
		APIPollSeconds:                2,
		ShutdownCooldownSeconds:       45,
		RequireNodeInventoryReach:     true,
		NodeInventoryReachWaitSeconds: 300,
		NodeInventoryReachPollSeconds: 5,
		RequireTimeSync:               true,
		TimeSyncWaitSeconds:           240,
		TimeSyncPollSeconds:           5,
		TimeSyncMode:                  "quorum",
		TimeSyncQuorum:                2,
		ReconcileAccessOnBoot:         true,
		AutoEtcdRestoreOnAPIFailure:   true,
		EtcdRestoreControlPlane:       "titan-0a",
		RequireStorageReady:           true,
		StorageReadyWaitSeconds:       420,
		StorageReadyPollSeconds:       5,
		StorageMinReadyNodes:          2,
		StorageCriticalPVCs: []string{
			"vault/data-vault-0",
			"postgres/postgres-data-postgres-0",
			"gitea/gitea-data",
			"sso/keycloak-data",
		},
		MinimumBatteryPercent: 20,
		RequiredNodeLabels: map[string]map[string]string{
			"titan-09": {
				"ananke.bstein.dev/harbor-bootstrap": "true",
			},
		},
		RequirePostStartProbes:    true,
		PostStartProbeWaitSeconds: 240,
		PostStartProbePollSeconds: 5,
		PostStartProbes: []string{
			"https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
			"https://scm.bstein.dev/api/healthz",
			"https://metrics.bstein.dev/api/health",
		},
		RequireServiceChecklist:         true,
		ServiceChecklistWaitSeconds:     420,
		ServiceChecklistPollSeconds:     5,
		ServiceChecklistStabilitySec:    120,
		ServiceChecklist:                defaultServiceChecklist(),
		RequireCriticalServiceEndpoints: true,
		CriticalServiceEndpointWaitSec:  420,
		CriticalServiceEndpointPollSec:  5,
		CriticalServiceEndpoints:        defaultCriticalServiceEndpoints(),
		RequireIngressChecklist:         true,
		IngressChecklistWaitSeconds:     420,
		IngressChecklistPollSeconds:     5,
		IngressChecklistAccepted:        []int{200, 301, 302, 307, 308, 401, 403, 404},
		IngressChecklistIgnoreHosts:     []string{},
		RequireNodeSSHAuth:              true,
		NodeSSHAuthWaitSeconds:          240,
		NodeSSHAuthPollSeconds:          5,
		RequireFluxHealth:               true,
		FluxHealthWaitSeconds:           900,
		FluxHealthPollSeconds:           5,
		IgnoreFluxKustomizations:        []string{},
		RequireWorkloadConvergence:      true,
		WorkloadConvergenceWaitSeconds:  900,
		WorkloadConvergencePollSeconds:  5,
		IgnoreWorkloadNamespaces:        []string{},
		IgnoreWorkloads:                 []string{},
		IgnoreUnavailableNodes:          []string{},
		AutoRecycleStuckPods:            true,
		StuckPodGraceSeconds:            180,
		VaultUnsealKeyFile:              "/var/lib/ananke/vault-unseal.key",
		VaultUnsealBreakglassTimeout:    15,
	}

	shutdownDefaults := Shutdown{
		DefaultBudgetSeconds: 1380,
		HistoryMinSamples:    3,
		EmergencyBudgetSec:   420,
		EmergencyMinSamples:  3,
		EmergencySkipEtcd:    true,
		EmergencySkipDrain:   true,
		DrainParallelism:     6,
		ScaleParallelism:     8,
		SSHParallelism:       8,
	}

	upsDefaults := UPS{
		Enabled:                 true,
		Provider:                "nut",
		PollSeconds:             5,
		RuntimeSafetyFactor:     1.25,
		DebounceCount:           3,
		TelemetryTimeoutSeconds: 90,
	}

	coordinationDefaults := Coordination{
		ForwardShutdownConfig: "/etc/ananke/ananke.yaml",
		PeerHosts:             []string{},
		FallbackLocalShutdown: true,
		CommandTimeoutSeconds: 25,
		StartupGuardMaxAgeSec: 900,
		Role:                  "coordinator",
		AllowStartupOnBattery: false,
	}

	metricsDefaults := Metrics{
		Enabled:  true,
		BindAddr: "0.0.0.0:9560",
		Path:     "/metrics",
	}

	stateDefaults := State{
		Dir:            "/var/lib/ananke",
		ReportsDir:     "/var/lib/ananke/reports",
		RunHistoryPath: "/var/lib/ananke/runs.json",
		LockPath:       "/var/lib/ananke/ananke.lock",
		IntentPath:     "/var/lib/ananke/intent.json",
	}

	cfg := Config{
		IACRepoPath:        "/opt/titan-iac",
		ExpectedFluxBranch: "main",
		ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
		SSHPort:            2277,
		ControlPlanes:      []string{"titan-0a", "titan-0b", "titan-0c"},
		LocalBootstrapPaths: []string{
			"infrastructure/core",
			"clusters/atlas/flux-system",
			"infrastructure/sources/helm",
			"infrastructure/metallb",
			"infrastructure/traefik",
			"infrastructure/cert-manager",
			"infrastructure/vault-csi",
			"infrastructure/vault-injector",
			"services/vault",
			"infrastructure/postgres",
			"services/gitea",
			"services/keycloak",
			"services/oauth2-proxy",
		},
		ExcludedNamespaces: []string{
			"kube-system",
			"kube-public",
			"kube-node-lease",
			"flux-system",
			"traefik",
			"metallb-system",
			"cert-manager",
			"longhorn-system",
			"vault",
			"postgres",
			"maintenance",
		},
		Startup:      startupDefaults,
		Shutdown:     shutdownDefaults,
		UPS:          upsDefaults,
		Coordination: coordinationDefaults,
		Metrics:      metricsDefaults,
		State:        stateDefaults,
	}

	// Run the same normalisation that loaded configs receive.
	cfg.applyDefaults()
	return cfg
}

View File

@ -0,0 +1,315 @@
package config
import "strings"
// defaultServiceChecklist returns the built-in list of per-service HTTP checks.
// Signature: defaultServiceChecklist() []ServiceChecklistCheck.
// Why: startup must verify real external behavior per service (not only generic
// ingress reachability) so false positives do not pass drills.
//
// Every entry accepts exactly one HTTP status and uses a 12-second timeout.
// Entries built with bodyCheck set BodyContains; entries built with
// redirectCheck set LocationContains instead.
func defaultServiceChecklist() []ServiceChecklistCheck {
	const timeoutSeconds = 12

	// bodyCheck builds an entry that sets the accepted status and BodyContains.
	bodyCheck := func(name, url string, status int, body string) ServiceChecklistCheck {
		return ServiceChecklistCheck{
			Name:             name,
			URL:              url,
			AcceptedStatuses: []int{status},
			BodyContains:     body,
			TimeoutSeconds:   timeoutSeconds,
		}
	}
	// redirectCheck builds an entry that sets the accepted status and
	// LocationContains.
	redirectCheck := func(name, url string, status int, location string) ServiceChecklistCheck {
		return ServiceChecklistCheck{
			Name:             name,
			URL:              url,
			AcceptedStatuses: []int{status},
			LocationContains: location,
			TimeoutSeconds:   timeoutSeconds,
		}
	}

	return []ServiceChecklistCheck{
		bodyCheck("gitea-api", "https://scm.bstein.dev/api/healthz", 200, "pass"),
		bodyCheck("grafana-api", "https://metrics.bstein.dev/api/health", 200, "\"database\":\"ok\""),
		bodyCheck("keycloak-oidc", "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", 200, "\"issuer\":\"https://sso.bstein.dev/realms/atlas\""),
		bodyCheck("harbor-registry-api", "https://registry.bstein.dev/v2/", 401, "unauthorized"),
		bodyCheck("alerts-ui", "https://alerts.bstein.dev/", 200, "Alertmanager"),
		redirectCheck("auth-gateway-redirect", "https://auth.bstein.dev/", 302, "https://sso.bstein.dev/realms/atlas/"),
		bodyCheck("home-site", "https://bstein.dev/", 200, "Titan Lab"),
		bodyCheck("actual-budget-ui", "https://budget.bstein.dev/", 200, "<title>Actual"),
		bodyCheck("element-call-ui", "https://call.live.bstein.dev/", 200, "Element Call"),
		bodyCheck("flux-gitops-ui", "https://cd.bstein.dev/", 200, "Weave GitOps"),
		bodyCheck("chat-ai-health", "https://chat.ai.bstein.dev/", 200, "\"ok\": true"),
		bodyCheck("jenkins-auth-gate", "https://ci.bstein.dev/", 403, "commenceLogin"),
		redirectCheck("nextcloud-login-redirect", "https://cloud.bstein.dev/", 302, "/index.php/login"),
		redirectCheck("wger-redirect", "https://health.bstein.dev/", 302, "/en/"),
		bodyCheck("livekit-edge", "https://kit.live.bstein.dev/", 404, "404 page not found"),
		bodyCheck("element-web-ui", "https://live.bstein.dev/", 200, "<title>Element</title>"),
		redirectCheck("logging-oidc-redirect", "https://logs.bstein.dev/", 302, "client_id=logs"),
		redirectCheck("longhorn-oidc-redirect", "https://longhorn.bstein.dev/", 302, "https://sso.bstein.dev/realms/atlas/"),
		bodyCheck("matrix-auth-ui", "https://matrix.live.bstein.dev/", 200, "matrix-authentication-service"),
		// Status-only check: neither a body nor a Location expectation is set.
		{
			Name:             "monero-edge",
			URL:              "https://monero.bstein.dev/",
			AcceptedStatuses: []int{404},
			TimeoutSeconds:   timeoutSeconds,
		},
		redirectCheck("firefly-login-redirect", "https://money.bstein.dev/", 302, "/login"),
		bodyCheck("outline-ui", "https://notes.bstein.dev/", 200, "<title>Outline</title>"),
		bodyCheck("collabora-probe", "https://office.bstein.dev/", 200, "OK"),
		bodyCheck("pegasus-ui", "https://pegasus.bstein.dev/", 200, "<title>Pegasus</title>"),
		bodyCheck("harbor-ui", "https://registry.bstein.dev/", 200, "<title>Harbor</title>"),
		redirectCheck("vault-ui-redirect", "https://secret.bstein.dev/", 307, "/ui/"),
		redirectCheck("sentinel-oidc-redirect", "https://sentinel.bstein.dev/", 302, "client_id=metis"),
		redirectCheck("keycloak-admin-redirect", "https://sso.bstein.dev/", 302, "https://sso.bstein.dev/admin/"),
		redirectCheck("jellyfin-edge", "https://stream.bstein.dev/", 302, "web/"),
		bodyCheck("planka-ui", "https://tasks.bstein.dev/", 200, "<title>PLANKA</title>"),
		bodyCheck("vaultwarden-ui", "https://vault.bstein.dev/", 200, "<title>Vaultwarden Web</title>"),
	}
}
// defaultCriticalServiceEndpoints returns the built-in "namespace/service"
// entries that startup treats as critical endpoints.
// Signature: defaultCriticalServiceEndpoints() []string.
// Why: service edge checks are insufficient for protected stacks; endpoint
// presence verifies that backends are actually routable before startup success.
// A fresh slice is returned on every call.
func defaultCriticalServiceEndpoints() []string {
	endpoints := make([]string, 0, 6)

	// Monitoring stack.
	endpoints = append(endpoints,
		"monitoring/victoria-metrics-single-server",
		"monitoring/grafana",
		"monitoring/kube-state-metrics",
	)
	// Logging stack.
	endpoints = append(endpoints,
		"logging/oauth2-proxy-logs",
		"logging/opensearch-dashboards",
		"logging/opensearch-master",
	)
	return endpoints
}
// mergeServiceChecklistDefaults combines configured checks with built-in ones.
// Signature: mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck.
// Why: host configs can keep custom checks while still inheriting mandatory
// baseline checks introduced after incident learnings.
//
// When existing is empty, a copy of defaults is returned unchanged. Otherwise
// the result holds every existing entry in order, followed by each default
// whose whitespace-trimmed name is non-empty and not used by an existing entry.
// Neither input slice is modified.
func mergeServiceChecklistDefaults(existing, defaults []ServiceChecklistCheck) []ServiceChecklistCheck {
	if len(existing) == 0 {
		cloned := make([]ServiceChecklistCheck, len(defaults))
		copy(cloned, defaults)
		return cloned
	}

	// Names already claimed by configured checks; unnamed entries claim nothing.
	taken := map[string]struct{}{}
	for i := range existing {
		if name := strings.TrimSpace(existing[i].Name); name != "" {
			taken[name] = struct{}{}
		}
	}

	merged := make([]ServiceChecklistCheck, len(existing), len(existing)+len(defaults))
	copy(merged, existing)
	for i := range defaults {
		name := strings.TrimSpace(defaults[i].Name)
		if name == "" {
			continue
		}
		if _, claimed := taken[name]; !claimed {
			merged = append(merged, defaults[i])
		}
	}
	return merged
}
// mergeStringDefaults combines configured string entries with built-in ones.
// Signature: mergeStringDefaults(existing, defaults []string) []string.
// Why: keeps baseline startup guards applied while preserving site-specific
// additions already declared in host configs.
//
// When existing is empty, a copy of defaults is returned verbatim (no trimming
// or de-duplication). Otherwise entries from existing and then from defaults
// are whitespace-trimmed, blank entries are dropped, and only the first
// occurrence of each value is kept, in order.
func mergeStringDefaults(existing, defaults []string) []string {
	if len(existing) == 0 {
		cloned := make([]string, len(defaults))
		copy(cloned, defaults)
		return cloned
	}

	merged := make([]string, 0, len(existing)+len(defaults))
	seen := map[string]struct{}{}
	// Configured entries come first so they keep their position in the result.
	for _, source := range [2][]string{existing, defaults} {
		for _, raw := range source {
			entry := strings.TrimSpace(raw)
			if entry == "" {
				continue
			}
			if _, dup := seen[entry]; dup {
				continue
			}
			seen[entry] = struct{}{}
			merged = append(merged, entry)
		}
	}
	return merged
}

156
internal/config/types.go Normal file
View File

@ -0,0 +1,156 @@
package config
// Config is the root configuration object. Each field is bound to a YAML key
// through its struct tag; the nested sections (Startup, Shutdown, UPS,
// Coordination, Metrics, State) are separate struct types in this package.
// Field order is left as declared because it determines marshalled key order.
type Config struct {
Kubeconfig string `yaml:"kubeconfig"`
// SSH connection settings. NOTE(review): the two maps are presumably keyed by
// node name (host and user overrides) — confirm against the SSH helper code.
SSHUser string `yaml:"ssh_user"`
SSHPort int `yaml:"ssh_port"`
SSHConfigFile string `yaml:"ssh_config_file"`
SSHIdentityFile string `yaml:"ssh_identity_file"`
SSHNodeHosts map[string]string `yaml:"ssh_node_hosts"`
SSHNodeUsers map[string]string `yaml:"ssh_node_users"`
SSHManagedNodes []string `yaml:"ssh_managed_nodes"`
SSHJumpHost string `yaml:"ssh_jump_host"`
SSHJumpUser string `yaml:"ssh_jump_user"`
// Repository and Flux expectations. Note the YAML key for ExpectedFluxSource
// is "expected_flux_source_url", which is longer than the Go field name.
IACRepoPath string `yaml:"iac_repo_path"`
ExpectedFluxBranch string `yaml:"expected_flux_branch"`
ExpectedFluxSource string `yaml:"expected_flux_source_url"`
// Cluster inventory and scoping lists.
ControlPlanes []string `yaml:"control_planes"`
Workers []string `yaml:"workers"`
LocalBootstrapPaths []string `yaml:"local_bootstrap_paths"`
ExcludedNamespaces []string `yaml:"excluded_namespaces"`
// Nested configuration sections.
Startup Startup `yaml:"startup"`
Shutdown Shutdown `yaml:"shutdown"`
UPS UPS `yaml:"ups"`
Coordination Coordination `yaml:"coordination"`
Metrics Metrics `yaml:"metrics"`
State State `yaml:"state"`
}
// Startup holds the settings stored under the "startup" key of Config.
//
// Most feature groups follow one naming pattern: a Require* boolean, a
// *WaitSeconds value and a *PollSeconds value (some Go names abbreviate
// "Seconds" to "Sec"; the YAML keys always spell out "_seconds"). The group
// comments below are read off the field names and YAML keys only; how each
// value is applied is decided by the startup code, which is not in this file.
type Startup struct {
// API wait/poll timing, post-shutdown cooldown and minimum battery level.
// NOTE(review): "API" presumably means the Kubernetes API server — confirm.
APIWaitSeconds int `yaml:"api_wait_seconds"`
APIPollSeconds int `yaml:"api_poll_seconds"`
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
// Node inventory reachability and required node labels.
// NOTE(review): RequiredNodeLabels looks like node name -> label key -> value;
// confirm against the code that reads it.
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
// Time synchronization settings.
RequireTimeSync bool `yaml:"require_time_sync"`
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
TimeSyncMode string `yaml:"time_sync_mode"`
TimeSyncQuorum int `yaml:"time_sync_quorum"`
// Access reconciliation and etcd restore settings.
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
// Storage readiness settings.
RequireStorageReady bool `yaml:"require_storage_ready"`
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
// Post-start probe settings.
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
PostStartProbes []string `yaml:"post_start_probes"`
// Service checklist settings; each entry is a ServiceChecklistCheck.
RequireServiceChecklist bool `yaml:"require_service_checklist"`
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
// Critical service endpoint settings.
// NOTE(review): entries are presumably "<namespace>/<service>" strings — confirm.
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
// Ingress checklist settings.
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
// Node SSH authentication settings.
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
// Flux health settings.
RequireFluxHealth bool `yaml:"require_flux_health"`
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
// Workload convergence settings and their ignore lists.
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
IgnoreWorkloads []string `yaml:"ignore_workloads"`
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
// Stuck pod recycling settings.
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
// Vault unseal settings. The breakglass timeout's YAML key ends in "_seconds".
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
// ServiceChecklistCheck describes one entry of Startup.ServiceChecklist: a
// named URL together with the expectations to evaluate against its response.
// The evaluation logic is not in this file, so the per-field notes below are
// assumptions drawn from the field names and should be confirmed there.
type ServiceChecklistCheck struct {
// Name identifies the check; the defaults-merging code matches entries by it.
Name string `yaml:"name"`
URL string `yaml:"url"`
// Presumably the HTTP status codes that count as a pass.
AcceptedStatuses []int `yaml:"accepted_statuses"`
// Presumably substrings required in / forbidden from the response's Location
// header (e.g. for redirect checks) — TODO confirm.
LocationContains string `yaml:"location_contains"`
LocationNotContains string `yaml:"location_not_contains"`
// Presumably substrings required in / forbidden from the response body.
BodyContains string `yaml:"body_contains"`
BodyNotContains string `yaml:"body_not_contains"`
// Per-check timeout, in seconds according to the name and YAML key.
TimeoutSeconds int `yaml:"timeout_seconds"`
// NOTE(review): presumably disables TLS certificate verification for this
// check only — confirm before enabling.
InsecureSkipTLS bool `yaml:"insecure_skip_tls"`
}
// Shutdown holds the settings stored under the "shutdown" key of Config.
// The notes below are read off the field names and YAML keys; the shutdown
// logic that consumes them is not in this file.
type Shutdown struct {
// Time budget (seconds) and minimum history sample count.
DefaultBudgetSeconds int `yaml:"default_budget_seconds"`
HistoryMinSamples int `yaml:"history_min_samples"`
// Emergency* counterparts of the budget, sample and skip settings.
// NOTE(review): presumably used only for emergency-mode shutdowns — confirm.
EmergencyBudgetSec int `yaml:"emergency_budget_seconds"`
EmergencyMinSamples int `yaml:"emergency_history_min_samples"`
EmergencySkipEtcd bool `yaml:"emergency_skip_etcd_snapshot"`
EmergencySkipDrain bool `yaml:"emergency_skip_drain"`
// Skip flags without the Emergency prefix.
SkipEtcdSnapshot bool `yaml:"skip_etcd_snapshot"`
SkipDrain bool `yaml:"skip_drain"`
// Parallelism limits for drain, scale and SSH operations.
DrainParallelism int `yaml:"drain_parallelism"`
ScaleParallelism int `yaml:"scale_parallelism"`
SSHParallelism int `yaml:"ssh_parallelism"`
}
// UPS holds the settings stored under the "ups" key of Config.
type UPS struct {
Enabled bool `yaml:"enabled"`
Provider string `yaml:"provider"`
// Target is a single target string and Targets a list of named ones; how the
// two are combined is decided by the consuming code, which is not in this file.
Target string `yaml:"target"`
Targets []UPSTarget `yaml:"targets"`
// Polling interval, in seconds according to the name.
PollSeconds int `yaml:"poll_seconds"`
// NOTE(review): presumably a multiplier applied to reported runtime, and a
// count of consecutive readings required before acting — confirm.
RuntimeSafetyFactor float64 `yaml:"runtime_safety_factor"`
DebounceCount int `yaml:"debounce_count"`
TelemetryTimeoutSeconds int `yaml:"telemetry_timeout_seconds"`
}
// UPSTarget is one entry of UPS.Targets: a name paired with a target string.
type UPSTarget struct {
Name string `yaml:"name"`
Target string `yaml:"target"`
}
// Coordination holds the settings stored under the "coordination" key of
// Config. The notes below are read off the field names and YAML keys; the
// coordination logic is not in this file.
type Coordination struct {
// Shutdown forwarding target (host, user, config).
// NOTE(review): ForwardShutdownConfig is presumably a config file path — confirm.
ForwardShutdownHost string `yaml:"forward_shutdown_host"`
ForwardShutdownUser string `yaml:"forward_shutdown_user"`
ForwardShutdownConfig string `yaml:"forward_shutdown_config"`
PeerHosts []string `yaml:"peer_hosts"`
FallbackLocalShutdown bool `yaml:"fallback_local_shutdown"`
// Durations in seconds, per the YAML keys.
CommandTimeoutSeconds int `yaml:"command_timeout_seconds"`
StartupGuardMaxAgeSec int `yaml:"startup_guard_max_age_seconds"`
Role string `yaml:"role"`
AllowStartupOnBattery bool `yaml:"allow_startup_on_battery"`
}
// Metrics holds the settings stored under the "metrics" key of Config: an
// enable flag, a bind address and a path.
type Metrics struct {
Enabled bool `yaml:"enabled"`
BindAddr string `yaml:"bind_addr"`
Path string `yaml:"path"`
}
// State holds the settings stored under the "state" key of Config: a base
// directory, a reports directory, and individual paths for run history, the
// lock and the intent record.
type State struct {
Dir string `yaml:"dir"`
ReportsDir string `yaml:"reports_dir"`
RunHistoryPath string `yaml:"run_history_path"`
LockPath string `yaml:"lock_path"`
IntentPath string `yaml:"intent_path"`
}