From 1f656de5df82ae43b1184203bede9ea9786472d6 Mon Sep 17 00:00:00 2001 From: codex Date: Tue, 5 May 2026 05:17:59 -0300 Subject: [PATCH] startup(ananke): scope emergency recovery to core services --- README.md | 14 +- configs/ananke.example.yaml | 5 + configs/ananke.tethys.yaml | 31 +++ configs/ananke.titan-db.yaml | 31 +++ .../cluster/orchestrator_access_fluxsource.go | 2 +- .../cluster/orchestrator_critical_vault.go | 25 ++ internal/cluster/orchestrator_fluxhealth.go | 23 ++ internal/cluster/orchestrator_ingress.go | 10 + internal/cluster/orchestrator_lifecycle.go | 29 +-- .../cluster/orchestrator_node_reachability.go | 2 +- .../cluster/orchestrator_shutdown_mode.go | 21 ++ .../cluster/orchestrator_startup_scope.go | 81 +++++++ .../cluster/orchestrator_startup_vault.go | 52 ++++ .../orchestrator_workload_convergence.go | 6 + .../cluster/orchestrator_workload_ignore.go | 6 + internal/cluster/testing_hooks_startup.go | 55 +++++ internal/config/apply_defaults.go | 18 +- internal/config/defaults.go | 94 ++++---- internal/config/load_additional_test.go | 38 +++ internal/config/types.go | 123 +++++----- internal/config/validate.go | 51 ++++ internal/config/validate_matrix_test.go | 17 ++ .../hooks_flux_workload_matrix_test.go | 59 +++++ .../hooks_ingress_service_matrix_test.go | 42 ++++ ...ks_lifecycle_cleanup_branch_matrix_test.go | 2 +- .../hooks_startup_scope_vault_test.go | 222 ++++++++++++++++++ ...oks_workload_storage_access_matrix_test.go | 26 ++ 27 files changed, 946 insertions(+), 139 deletions(-) create mode 100644 internal/cluster/orchestrator_shutdown_mode.go create mode 100644 internal/cluster/orchestrator_startup_scope.go create mode 100644 internal/cluster/orchestrator_startup_vault.go create mode 100644 internal/cluster/testing_hooks_startup.go create mode 100644 testing/orchestrator/hooks_startup_scope_vault_test.go diff --git a/README.md b/README.md index f8f4393..0b9673c 100644 --- a/README.md +++ b/README.md @@ -97,10 +97,15 @@ Primary config path: Keep these fields accurate: - `expected_flux_source_url` - `expected_flux_branch` +- `startup.service_checklist_explicit_only` - `startup.service_checklist` - `startup.critical_service_endpoints` - `startup.require_ingress_checklist` - `startup.require_node_inventory_reachability` +- `startup.node_inventory_reachability_required_nodes` +- `startup.node_ssh_auth_required_nodes` +- `startup.flux_health_required_kustomizations` +- `startup.workload_convergence_required_namespaces` - `startup.ignore_unavailable_nodes` - `coordination.role` - `coordination.peer_hosts` @@ -134,9 +139,10 @@ Installer behavior: When adding nodes or services: 1. Update inventory and node mapping in config. -2. Add/adjust service checklist entries for anything user-facing or critical. -3. Add/adjust ingress expectations for exposed services. -4. Use temporary ignores only when truly intentional, then remove them. -5. Run `scripts/quality_gate.sh` before host deployment. +2. Keep the explicit service checklist focused on the core services that must come back during an outage. +3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap. +4. Add/adjust ingress expectations for exposed services. +5. Use temporary ignores only when truly intentional, then remove them. +6. Run `scripts/quality_gate.sh` before host deployment. Recovery quality should improve over time: every drill should reduce manual work in the next drill. 
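For orientation, here is a minimal sketch of how the new startup scopes narrow an emergency recovery to the core set. The field names are the ones listed above; the node names, kustomizations, and namespaces mirror (in abridged form) the values this patch adds to `configs/ananke.tethys.yaml`, not built-in defaults:

```yaml
startup:
  # Gate reachability and SSH auth only on the control-plane nodes.
  node_inventory_reachability_required_nodes: [titan-0a, titan-0b, titan-0c]
  node_ssh_auth_required_nodes: [titan-0a, titan-0b, titan-0c]
  # Treat the declared checklist as complete instead of merging in defaults.
  service_checklist_explicit_only: true
  # Wait only on the core GitOps slice and core namespaces during bootstrap.
  flux_health_required_kustomizations:
    - flux-system/core
    - flux-system/vault
    - flux-system/postgres
  workload_convergence_required_namespaces: [vault, postgres, sso]
```

Leaving any of these lists empty keeps the stricter all-nodes / all-kustomizations behavior, which is what the empty-slice defaults in `internal/config/defaults.go` provide.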
diff --git a/configs/ananke.example.yaml b/configs/ananke.example.yaml index f18aeb8..e12a1bf 100644 --- a/configs/ananke.example.yaml +++ b/configs/ananke.example.yaml @@ -51,6 +51,7 @@ startup: require_node_inventory_reachability: true node_inventory_reachability_wait_seconds: 300 node_inventory_reachability_poll_seconds: 5 + node_inventory_reachability_required_nodes: [] required_node_labels: titan-09: ananke.bstein.dev/harbor-bootstrap: "true" @@ -90,6 +91,7 @@ startup: admin_secret_name: keycloak-admin admin_secret_username_key: username admin_secret_password_key: password + service_checklist_explicit_only: false service_checklist: - name: gitea-api url: https://scm.bstein.dev/api/healthz @@ -134,13 +136,16 @@ startup: require_node_ssh_auth: true node_ssh_auth_wait_seconds: 240 node_ssh_auth_poll_seconds: 5 + node_ssh_auth_required_nodes: [] require_flux_health: true flux_health_wait_seconds: 900 flux_health_poll_seconds: 5 + flux_health_required_kustomizations: [] ignore_flux_kustomizations: [] require_workload_convergence: true workload_convergence_wait_seconds: 900 workload_convergence_poll_seconds: 5 + workload_convergence_required_namespaces: [] ignore_workload_namespaces: [] ignore_workloads: [] ignore_unavailable_nodes: [] diff --git a/configs/ananke.tethys.yaml b/configs/ananke.tethys.yaml index 378ee61..e8dfcba 100644 --- a/configs/ananke.tethys.yaml +++ b/configs/ananke.tethys.yaml @@ -117,6 +117,10 @@ startup: require_node_inventory_reachability: true node_inventory_reachability_wait_seconds: 300 node_inventory_reachability_poll_seconds: 5 + node_inventory_reachability_required_nodes: + - titan-0a + - titan-0b + - titan-0c required_node_labels: titan-09: ananke.bstein.dev/harbor-bootstrap: "true" @@ -156,6 +160,7 @@ startup: admin_secret_name: keycloak-admin admin_secret_username_key: username admin_secret_password_key: password + service_checklist_explicit_only: true service_checklist: - name: gitea-api url: https://scm.bstein.dev/api/healthz @@ -200,13 +205,39 @@ startup: require_node_ssh_auth: true node_ssh_auth_wait_seconds: 240 node_ssh_auth_poll_seconds: 5 + node_ssh_auth_required_nodes: + - titan-0a + - titan-0b + - titan-0c require_flux_health: true flux_health_wait_seconds: 900 flux_health_poll_seconds: 5 + flux_health_required_kustomizations: + - flux-system/core + - flux-system/helm + - flux-system/traefik + - flux-system/cert-manager + - flux-system/longhorn + - flux-system/vault-csi + - flux-system/vault-injector + - flux-system/postgres + - flux-system/vault + - flux-system/keycloak + - flux-system/oauth2-proxy + - flux-system/gitea + - flux-system/monitoring + - flux-system/harbor ignore_flux_kustomizations: [] require_workload_convergence: true workload_convergence_wait_seconds: 900 workload_convergence_poll_seconds: 5 + workload_convergence_required_namespaces: + - vault + - postgres + - sso + - gitea + - monitoring + - harbor ignore_workload_namespaces: [] ignore_workloads: [] ignore_unavailable_nodes: [] diff --git a/configs/ananke.titan-db.yaml b/configs/ananke.titan-db.yaml index d59a6b6..680c7ac 100644 --- a/configs/ananke.titan-db.yaml +++ b/configs/ananke.titan-db.yaml @@ -117,6 +117,10 @@ startup: require_node_inventory_reachability: true node_inventory_reachability_wait_seconds: 300 node_inventory_reachability_poll_seconds: 5 + node_inventory_reachability_required_nodes: + - titan-0a + - titan-0b + - titan-0c required_node_labels: titan-09: ananke.bstein.dev/harbor-bootstrap: "true" @@ -156,6 +160,7 @@ startup: admin_secret_name: keycloak-admin 
admin_secret_username_key: username admin_secret_password_key: password + service_checklist_explicit_only: true service_checklist: - name: gitea-api url: https://scm.bstein.dev/api/healthz @@ -200,13 +205,39 @@ startup: require_node_ssh_auth: true node_ssh_auth_wait_seconds: 240 node_ssh_auth_poll_seconds: 5 + node_ssh_auth_required_nodes: + - titan-0a + - titan-0b + - titan-0c require_flux_health: true flux_health_wait_seconds: 900 flux_health_poll_seconds: 5 + flux_health_required_kustomizations: + - flux-system/core + - flux-system/helm + - flux-system/traefik + - flux-system/cert-manager + - flux-system/longhorn + - flux-system/vault-csi + - flux-system/vault-injector + - flux-system/postgres + - flux-system/vault + - flux-system/keycloak + - flux-system/oauth2-proxy + - flux-system/gitea + - flux-system/monitoring + - flux-system/harbor ignore_flux_kustomizations: [] require_workload_convergence: true workload_convergence_wait_seconds: 900 workload_convergence_poll_seconds: 5 + workload_convergence_required_namespaces: + - vault + - postgres + - sso + - gitea + - monitoring + - harbor ignore_workload_namespaces: [] ignore_workloads: [] ignore_unavailable_nodes: [] diff --git a/internal/cluster/orchestrator_access_fluxsource.go b/internal/cluster/orchestrator_access_fluxsource.go index 22c1947..529a875 100644 --- a/internal/cluster/orchestrator_access_fluxsource.go +++ b/internal/cluster/orchestrator_access_fluxsource.go @@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) seen := map[string]struct{}{} targets := make([]string, 0, len(nodes)) - for _, node := range nodes { + for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) { node = strings.TrimSpace(node) if node == "" { continue diff --git a/internal/cluster/orchestrator_critical_vault.go b/internal/cluster/orchestrator_critical_vault.go index 907c787..c93136e 100644 --- a/internal/cluster/orchestrator_critical_vault.go +++ b/internal/cluster/orchestrator_critical_vault.go @@ -227,6 +227,31 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name) } +// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step. +// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error). +// Why: lets startup defer vault unseal until the pod is actually runnable, while +// keeping the direct unseal helper strict for explicit recovery paths and tests. +func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) { + if o.runner.DryRun { + return false, "", nil + } + + phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}") + if err != nil { + if isNotFoundErr(err) { + return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil + } + return false, "", fmt.Errorf("vault pod phase check failed: %w", err) + } + + trimmedPhase := strings.TrimSpace(phase) + if trimmedPhase != "Running" { + return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil + } + + return false, "", o.ensureVaultUnsealed(ctx) +} + // ensureVaultUnsealed runs one orchestration or CLI step. // Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error. 
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. diff --git a/internal/cluster/orchestrator_fluxhealth.go b/internal/cluster/orchestrator_fluxhealth.go index b180056..2574e7f 100644 --- a/internal/cluster/orchestrator_fluxhealth.go +++ b/internal/cluster/orchestrator_fluxhealth.go @@ -143,6 +143,8 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error return false, "", fmt.Errorf("decode flux kustomizations: %w", err) } ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations) + required := o.startupRequiredFluxKustomizations() + requiredSeen := map[string]struct{}{} notReady := []string{} for _, ks := range list.Items { ns := strings.TrimSpace(ks.Metadata.Namespace) @@ -154,6 +156,12 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error if ks.Spec.Suspend { continue } + if len(required) > 0 { + if _, ok := required[full]; !ok { + continue + } + requiredSeen[full] = struct{}{} + } if _, ok := ignored[full]; ok { continue } @@ -173,10 +181,25 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error } notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason)) } + if len(required) > 0 { + missing := []string{} + for full := range required { + if _, ok := requiredSeen[full]; !ok { + missing = append(missing, full+"(missing)") + } + } + if len(missing) > 0 { + sort.Strings(missing) + notReady = append(notReady, missing...) + } + } if len(notReady) > 0 { sort.Strings(notReady) return false, "not ready: " + joinLimited(notReady, 6), nil } + if len(required) > 0 { + return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil + } return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil } diff --git a/internal/cluster/orchestrator_ingress.go b/internal/cluster/orchestrator_ingress.go index beae3c1..134c383 100644 --- a/internal/cluster/orchestrator_ingress.go +++ b/internal/cluster/orchestrator_ingress.go @@ -19,6 +19,7 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error { if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 { return nil } + ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels)) for node := range o.cfg.Startup.RequiredNodeLabels { node = strings.TrimSpace(node) @@ -28,6 +29,10 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error { } sort.Strings(nodes) for _, node := range nodes { + if _, skip := ignored[node]; skip { + o.log.Printf("skipping required node labels for ignored unavailable node %s", node) + continue + } labels := o.cfg.Startup.RequiredNodeLabels[node] if len(labels) == 0 { continue @@ -55,6 +60,11 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error { continue } if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil { + if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) { + o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err) + o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node)) + continue + } return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err) } o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", ")) diff --git a/internal/cluster/orchestrator_lifecycle.go b/internal/cluster/orchestrator_lifecycle.go index acfa84d..5e849be 
100644 --- a/internal/cluster/orchestrator_lifecycle.go +++ b/internal/cluster/orchestrator_lifecycle.go @@ -37,14 +37,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er return invErr } o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed") - if err := o.waitForAPI(ctx, 1, time.Second); err == nil { - o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed while kubernetes api is already available") - if err := o.ensureVaultUnsealed(ctx); err != nil { - o.noteStartupCheck("vault-unseal", false, err.Error()) - return err - } - o.noteStartupCheck("vault-unseal", true, "vault is unsealed") - } + o.maybeRunEarlyVaultUnseal(ctx) o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory") if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil { o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error()) @@ -187,12 +180,9 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er } } o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable") - o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates") - if err := o.ensureVaultUnsealed(ctx); err != nil { - o.noteStartupCheck("vault-unseal", false, err.Error()) + if err := o.runStartupVaultUnsealGate(ctx); err != nil { return err } - o.noteStartupCheck("vault-unseal", true, "vault is unsealed") if err := o.ensureRequiredNodeLabels(ctx); err != nil { return err } @@ -490,18 +480,3 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err o.log.Printf("shutdown flow complete") return nil } - -// normalizeShutdownMode runs one orchestration or CLI step. -// Signature: normalizeShutdownMode(raw string) (string, error). -// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only -// semantics while preserving compatibility with legacy "config" callers. -func normalizeShutdownMode(raw string) (string, error) { - switch strings.TrimSpace(raw) { - case "", "config", "cluster-only": - return "cluster-only", nil - case "poweroff": - return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw) - default: - return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw) - } -} diff --git a/internal/cluster/orchestrator_node_reachability.go b/internal/cluster/orchestrator_node_reachability.go index 9209f75..275cc4e 100644 --- a/internal/cluster/orchestrator_node_reachability.go +++ b/internal/cluster/orchestrator_node_reachability.go @@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) targets := make([]string, 0, len(o.inventoryNodesForValidation())) seen := map[string]struct{}{} - for _, node := range o.inventoryNodesForValidation() { + for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) { node = strings.TrimSpace(node) if node == "" { continue diff --git a/internal/cluster/orchestrator_shutdown_mode.go b/internal/cluster/orchestrator_shutdown_mode.go new file mode 100644 index 0000000..9ab9f69 --- /dev/null +++ b/internal/cluster/orchestrator_shutdown_mode.go @@ -0,0 +1,21 @@ +package cluster + +import ( + "fmt" + "strings" +) + +// normalizeShutdownMode runs one orchestration or CLI step. 
+// Signature: normalizeShutdownMode(raw string) (string, error). +// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only +// semantics while preserving compatibility with legacy "config" callers. +func normalizeShutdownMode(raw string) (string, error) { + switch strings.TrimSpace(raw) { + case "", "config", "cluster-only": + return "cluster-only", nil + case "poweroff": + return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw) + default: + return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw) + } +} diff --git a/internal/cluster/orchestrator_startup_scope.go b/internal/cluster/orchestrator_startup_scope.go new file mode 100644 index 0000000..7b6c515 --- /dev/null +++ b/internal/cluster/orchestrator_startup_scope.go @@ -0,0 +1,81 @@ +package cluster + +import "strings" + +// startupRequiredNodes runs one orchestration or CLI step. +// Signature: startupRequiredNodes(nodes []string, required []string) []string. +// Why: lets startup enforce a smaller core node set during outage recovery +// without losing the stricter all-nodes behavior when no override is configured. +func startupRequiredNodes(nodes []string, required []string) []string { + requiredSet := makeStringSet(required) + if len(requiredSet) == 0 { + return nodes + } + filtered := make([]string, 0, len(nodes)) + for _, node := range nodes { + node = strings.TrimSpace(node) + if node == "" { + continue + } + if _, ok := requiredSet[node]; ok { + filtered = append(filtered, node) + } + } + return filtered +} + +// startupNodeStrictlyRequired runs one orchestration or CLI step. +// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool. +// Why: absent or broken non-core nodes should not block recovery-only actions +// like label reconciliation once the operator has narrowed startup to core nodes. +func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool { + node = strings.TrimSpace(node) + if node == "" { + return false + } + if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 { + return true + } + for _, controlPlane := range o.cfg.ControlPlanes { + if strings.TrimSpace(controlPlane) == node { + return true + } + } + if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) { + return true + } + return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node) +} + +// startupRequiredFluxKustomizations runs one orchestration or CLI step. +// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}. +// Why: lets outage recovery wait on a declared core GitOps slice while leaving +// optional stacks free to converge after bootstrap succeeds. +func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} { + return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations) +} + +// startupRequiredWorkloadNamespaces runs one orchestration or CLI step. +// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}. +// Why: keeps workload readiness scoped to core namespaces during recovery while +// preserving broad convergence checks when no explicit core list is configured. +func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} { + return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces) +} + +// containsNode runs one orchestration or CLI step. +// Signature: containsNode(entries []string, needle string) bool. 
+// Why: keeps node-scope checks small and explicit anywhere startup narrows its +// recovery gates to a declared core set. +func containsNode(entries []string, needle string) bool { + needle = strings.TrimSpace(needle) + if needle == "" { + return false + } + for _, entry := range entries { + if strings.TrimSpace(entry) == needle { + return true + } + } + return false +} diff --git a/internal/cluster/orchestrator_startup_vault.go b/internal/cluster/orchestrator_startup_vault.go new file mode 100644 index 0000000..0a671c1 --- /dev/null +++ b/internal/cluster/orchestrator_startup_vault.go @@ -0,0 +1,52 @@ +package cluster + +import ( + "context" + "fmt" + "time" +) + +// maybeRunEarlyVaultUnseal runs one orchestration or CLI step. +// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context). +// Why: gives startup a best-effort Vault recovery path when the API is already +// live, without consuming the hard startup failure path before workloads recover. +func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) { + if err := o.waitForAPI(ctx, 1, time.Second); err != nil { + return + } + + o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available") + deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx) + if err != nil { + o.log.Printf("warning: early vault unseal deferred: %v", err) + o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err)) + return + } + if deferred { + o.log.Printf("vault early unseal deferred: %s", detail) + o.noteStartupAutoHeal(detail) + return + } + o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed") +} + +// runStartupVaultUnsealGate runs one orchestration or CLI step. +// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error. +// Why: keeps the top-level startup flow readable while allowing Vault unseal to +// defer cleanly until critical workload recovery when the pod is not runnable yet. 
+func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error { + o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates") + deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx) + if err != nil { + o.noteStartupCheck("vault-unseal", false, err.Error()) + return err + } + if deferred { + o.log.Printf("vault unseal deferred until workload recovery: %s", detail) + o.noteStartupAutoHeal(detail) + o.noteStartupCheck("vault-unseal", true, detail) + return nil + } + o.noteStartupCheck("vault-unseal", true, "vault is unsealed") + return nil +} diff --git a/internal/cluster/orchestrator_workload_convergence.go b/internal/cluster/orchestrator_workload_convergence.go index a9bd35f..179b0ca 100644 --- a/internal/cluster/orchestrator_workload_convergence.go +++ b/internal/cluster/orchestrator_workload_convergence.go @@ -71,6 +71,7 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri if err := json.Unmarshal([]byte(out), &list); err != nil { return false, "", fmt.Errorf("decode controllers: %w", err) } + requiredNamespaces := o.startupRequiredWorkloadNamespaces() ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads) @@ -84,6 +85,11 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri if kind == "" || ns == "" || name == "" { continue } + if len(requiredNamespaces) > 0 { + if _, ok := requiredNamespaces[ns]; !ok { + continue + } + } if _, ok := ignoredNamespaces[ns]; ok { continue } diff --git a/internal/cluster/orchestrator_workload_ignore.go b/internal/cluster/orchestrator_workload_ignore.go index 6405731..c286186 100644 --- a/internal/cluster/orchestrator_workload_ignore.go +++ b/internal/cluster/orchestrator_workload_ignore.go @@ -116,6 +116,7 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) return nil, fmt.Errorf("decode pods: %w", err) } + requiredNamespaces := o.startupRequiredWorkloadNamespaces() ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) stuckReasons := map[string]struct{}{ @@ -138,6 +139,11 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) if ns == "" || name == "" { continue } + if len(requiredNamespaces) > 0 { + if _, ok := requiredNamespaces[ns]; !ok { + continue + } + } if _, ok := ignoredNamespaces[ns]; ok { continue } diff --git a/internal/cluster/testing_hooks_startup.go b/internal/cluster/testing_hooks_startup.go new file mode 100644 index 0000000..ee32165 --- /dev/null +++ b/internal/cluster/testing_hooks_startup.go @@ -0,0 +1,55 @@ +package cluster + +import "context" + +// TestHookStartupRequiredNodes runs one orchestration or CLI step. +// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string. +// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing. +func TestHookStartupRequiredNodes(nodes []string, required []string) []string { + return startupRequiredNodes(nodes, required) +} + +// TestHookContainsNode runs one orchestration or CLI step. +// Signature: TestHookContainsNode(entries []string, needle string) bool. +// Why: exposes the small startup-scope membership helper to top-level tests. 
+func TestHookContainsNode(entries []string, needle string) bool { + return containsNode(entries, needle) +} + +// TestHookStartupNodeStrictlyRequired runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool. +// Why: exposes strict-node startup scoping so outage-recovery tests can confirm +// non-core nodes stop blocking bootstrap. +func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool { + return o.startupNodeStrictlyRequired(node) +} + +// TestHookStartupRequiredFluxKustomizations runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}. +// Why: exposes flux startup scoping so top-level tests can confirm only core +// kustomizations block emergency bootstrap. +func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} { + return o.startupRequiredFluxKustomizations() +} + +// TestHookStartupRequiredWorkloadNamespaces runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}. +// Why: exposes workload namespace startup scoping so top-level tests can +// confirm only core workloads block emergency bootstrap. +func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} { + return o.startupRequiredWorkloadNamespaces() +} + +// TestHookMaybeRunEarlyVaultUnseal runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context). +// Why: exposes the early startup Vault deferral helper to top-level tests. +func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) { + o.maybeRunEarlyVaultUnseal(ctx) +} + +// TestHookRunStartupVaultUnsealGate runs one orchestration or CLI step. +// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error. +// Why: exposes the startup Vault gate helper to top-level tests. 
+func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error { + return o.runStartupVaultUnsealGate(ctx) +} diff --git a/internal/config/apply_defaults.go b/internal/config/apply_defaults.go index 88ebf3b..7363fb6 100644 --- a/internal/config/apply_defaults.go +++ b/internal/config/apply_defaults.go @@ -33,6 +33,9 @@ func (c *Config) applyDefaults() { if c.Startup.NodeInventoryReachPollSeconds <= 0 { c.Startup.NodeInventoryReachPollSeconds = 5 } + if c.Startup.NodeInventoryReachRequiredNodes == nil { + c.Startup.NodeInventoryReachRequiredNodes = []string{} + } if c.Startup.RequiredNodeLabels == nil { c.Startup.RequiredNodeLabels = map[string]map[string]string{ "titan-09": { @@ -121,7 +124,11 @@ func (c *Config) applyDefaults() { if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" { c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password" } - c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist()) + if c.Startup.ServiceChecklistExplicitOnly { + c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{}) + } else { + c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist()) + } for i := range c.Startup.ServiceChecklist { if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 { c.Startup.ServiceChecklist[i].TimeoutSeconds = 12 @@ -152,12 +159,18 @@ func (c *Config) applyDefaults() { if c.Startup.NodeSSHAuthPollSeconds <= 0 { c.Startup.NodeSSHAuthPollSeconds = 5 } + if c.Startup.NodeSSHAuthRequiredNodes == nil { + c.Startup.NodeSSHAuthRequiredNodes = []string{} + } if c.Startup.FluxHealthWaitSeconds <= 0 { c.Startup.FluxHealthWaitSeconds = 900 } if c.Startup.FluxHealthPollSeconds <= 0 { c.Startup.FluxHealthPollSeconds = 5 } + if c.Startup.FluxHealthRequiredKustomizations == nil { + c.Startup.FluxHealthRequiredKustomizations = []string{} + } if c.Startup.IgnoreFluxKustomizations == nil { c.Startup.IgnoreFluxKustomizations = []string{} } @@ -167,6 +180,9 @@ func (c *Config) applyDefaults() { if c.Startup.WorkloadConvergencePollSeconds <= 0 { c.Startup.WorkloadConvergencePollSeconds = 5 } + if c.Startup.WorkloadConvergenceRequiredNamespaces == nil { + c.Startup.WorkloadConvergenceRequiredNamespaces = []string{} + } if c.Startup.IgnoreWorkloadNamespaces == nil { c.Startup.IgnoreWorkloadNamespaces = []string{} } diff --git a/internal/config/defaults.go b/internal/config/defaults.go index b1bcfa4..2d33a39 100644 --- a/internal/config/defaults.go +++ b/internal/config/defaults.go @@ -39,24 +39,25 @@ func defaults() Config { "maintenance", }, Startup: Startup{ - APIWaitSeconds: 1200, - APIPollSeconds: 2, - ShutdownCooldownSeconds: 45, - RequireNodeInventoryReach: true, - NodeInventoryReachWaitSeconds: 300, - NodeInventoryReachPollSeconds: 5, - RequireTimeSync: true, - TimeSyncWaitSeconds: 240, - TimeSyncPollSeconds: 5, - TimeSyncMode: "quorum", - TimeSyncQuorum: 2, - ReconcileAccessOnBoot: true, - AutoEtcdRestoreOnAPIFailure: true, - EtcdRestoreControlPlane: "titan-0a", - RequireStorageReady: true, - StorageReadyWaitSeconds: 420, - StorageReadyPollSeconds: 5, - StorageMinReadyNodes: 2, + APIWaitSeconds: 1200, + APIPollSeconds: 2, + ShutdownCooldownSeconds: 45, + RequireNodeInventoryReach: true, + NodeInventoryReachWaitSeconds: 300, + NodeInventoryReachPollSeconds: 5, + NodeInventoryReachRequiredNodes: []string{}, + RequireTimeSync: true, + TimeSyncWaitSeconds: 240, + 
TimeSyncPollSeconds: 5, + TimeSyncMode: "quorum", + TimeSyncQuorum: 2, + ReconcileAccessOnBoot: true, + AutoEtcdRestoreOnAPIFailure: true, + EtcdRestoreControlPlane: "titan-0a", + RequireStorageReady: true, + StorageReadyWaitSeconds: 420, + StorageReadyPollSeconds: 5, + StorageMinReadyNodes: 2, StorageCriticalPVCs: []string{ "vault/data-vault-0", "postgres/postgres-data-postgres-0", @@ -91,33 +92,36 @@ func defaults() Config { AdminSecretUsernameKey: "username", AdminSecretPasswordKey: "password", }, - ServiceChecklist: defaultServiceChecklist(), - RequireCriticalServiceEndpoints: true, - CriticalServiceEndpointWaitSec: 420, - CriticalServiceEndpointPollSec: 5, - CriticalServiceEndpoints: defaultCriticalServiceEndpoints(), - RequireIngressChecklist: true, - IngressChecklistWaitSeconds: 420, - IngressChecklistPollSeconds: 5, - IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404}, - IngressChecklistIgnoreHosts: []string{}, - RequireNodeSSHAuth: true, - NodeSSHAuthWaitSeconds: 240, - NodeSSHAuthPollSeconds: 5, - RequireFluxHealth: true, - FluxHealthWaitSeconds: 900, - FluxHealthPollSeconds: 5, - IgnoreFluxKustomizations: []string{}, - RequireWorkloadConvergence: true, - WorkloadConvergenceWaitSeconds: 900, - WorkloadConvergencePollSeconds: 5, - IgnoreWorkloadNamespaces: []string{}, - IgnoreWorkloads: []string{}, - IgnoreUnavailableNodes: []string{}, - AutoRecycleStuckPods: true, - StuckPodGraceSeconds: 180, - VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key", - VaultUnsealBreakglassTimeout: 15, + ServiceChecklist: defaultServiceChecklist(), + RequireCriticalServiceEndpoints: true, + CriticalServiceEndpointWaitSec: 420, + CriticalServiceEndpointPollSec: 5, + CriticalServiceEndpoints: defaultCriticalServiceEndpoints(), + RequireIngressChecklist: true, + IngressChecklistWaitSeconds: 420, + IngressChecklistPollSeconds: 5, + IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404}, + IngressChecklistIgnoreHosts: []string{}, + RequireNodeSSHAuth: true, + NodeSSHAuthWaitSeconds: 240, + NodeSSHAuthPollSeconds: 5, + NodeSSHAuthRequiredNodes: []string{}, + RequireFluxHealth: true, + FluxHealthWaitSeconds: 900, + FluxHealthPollSeconds: 5, + FluxHealthRequiredKustomizations: []string{}, + IgnoreFluxKustomizations: []string{}, + RequireWorkloadConvergence: true, + WorkloadConvergenceWaitSeconds: 900, + WorkloadConvergencePollSeconds: 5, + WorkloadConvergenceRequiredNamespaces: []string{}, + IgnoreWorkloadNamespaces: []string{}, + IgnoreWorkloads: []string{}, + IgnoreUnavailableNodes: []string{}, + AutoRecycleStuckPods: true, + StuckPodGraceSeconds: 180, + VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key", + VaultUnsealBreakglassTimeout: 15, }, Shutdown: Shutdown{ DefaultBudgetSeconds: 1380, diff --git a/internal/config/load_additional_test.go b/internal/config/load_additional_test.go index ff73f26..dd85311 100644 --- a/internal/config/load_additional_test.go +++ b/internal/config/load_additional_test.go @@ -51,3 +51,41 @@ startup: t.Fatalf("expected validation failure") } } + +// TestLoadKeepsExplicitServiceChecklist runs one orchestration or CLI step. +// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T). +// Why: host recovery configs must be able to keep a narrow, explicit checklist +// without silently inheriting the full default service catalog. 
+func TestLoadKeepsExplicitServiceChecklist(t *testing.T) { + cfgPath := filepath.Join(t.TempDir(), "ananke.yaml") + raw := ` +control_planes: [titan-0a] +expected_flux_branch: main +expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git +iac_repo_path: /opt/titan-iac +startup: + service_checklist_explicit_only: true + service_checklist: + - name: gitea-api + url: https://scm.bstein.dev/api/healthz + accepted_statuses: [200] + body_contains: pass + timeout_seconds: 12 +ups: + enabled: false +` + if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil { + t.Fatalf("write config: %v", err) + } + + cfg, err := Load(cfgPath) + if err != nil { + t.Fatalf("load config: %v", err) + } + if len(cfg.Startup.ServiceChecklist) != 1 { + t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist)) + } + if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" { + t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name) + } +} diff --git a/internal/config/types.go b/internal/config/types.go index 5b74c98..1ae797c 100644 --- a/internal/config/types.go +++ b/internal/config/types.go @@ -27,65 +27,70 @@ type Config struct { } type Startup struct { - APIWaitSeconds int `yaml:"api_wait_seconds"` - APIPollSeconds int `yaml:"api_poll_seconds"` - ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"` - MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"` - RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"` - NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"` - NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"` - RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"` - RequireTimeSync bool `yaml:"require_time_sync"` - TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` - TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` - TimeSyncMode string `yaml:"time_sync_mode"` - TimeSyncQuorum int `yaml:"time_sync_quorum"` - ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` - AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` - EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` - RequireStorageReady bool `yaml:"require_storage_ready"` - StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` - StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` - StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` - StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` - RequirePostStartProbes bool `yaml:"require_post_start_probes"` - PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` - PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` - PostStartProbes []string `yaml:"post_start_probes"` - RequireServiceChecklist bool `yaml:"require_service_checklist"` - ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"` - ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"` - ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"` - ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"` - ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"` - RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"` - CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"` - CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"` - 
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"` - RequireIngressChecklist bool `yaml:"require_ingress_checklist"` - IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"` - IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"` - IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"` - IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"` - IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"` - RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"` - NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"` - NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"` - RequireFluxHealth bool `yaml:"require_flux_health"` - FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"` - FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"` - IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"` - RequireWorkloadConvergence bool `yaml:"require_workload_convergence"` - WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"` - WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"` - IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"` - IgnoreWorkloads []string `yaml:"ignore_workloads"` - IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"` - AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"` - StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"` - VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` - VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` - VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` + APIWaitSeconds int `yaml:"api_wait_seconds"` + APIPollSeconds int `yaml:"api_poll_seconds"` + ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"` + MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"` + RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"` + NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"` + NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"` + NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"` + RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"` + RequireTimeSync bool `yaml:"require_time_sync"` + TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"` + TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"` + TimeSyncMode string `yaml:"time_sync_mode"` + TimeSyncQuorum int `yaml:"time_sync_quorum"` + ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"` + AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"` + EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"` + RequireStorageReady bool `yaml:"require_storage_ready"` + StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"` + StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"` + StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"` + StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"` + RequirePostStartProbes bool `yaml:"require_post_start_probes"` + PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"` + PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"` + PostStartProbes []string `yaml:"post_start_probes"` + RequireServiceChecklist bool `yaml:"require_service_checklist"` + ServiceChecklistWaitSeconds int 
`yaml:"service_checklist_wait_seconds"` + ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"` + ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"` + ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"` + ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"` + ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"` + RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"` + CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"` + CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"` + CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"` + RequireIngressChecklist bool `yaml:"require_ingress_checklist"` + IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"` + IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"` + IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"` + IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"` + IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"` + RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"` + NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"` + NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"` + NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"` + RequireFluxHealth bool `yaml:"require_flux_health"` + FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"` + FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"` + FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"` + IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"` + RequireWorkloadConvergence bool `yaml:"require_workload_convergence"` + WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"` + WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"` + WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"` + IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"` + IgnoreWorkloads []string `yaml:"ignore_workloads"` + IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"` + AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"` + StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"` + VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"` + VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"` + VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"` } type ServiceChecklistCheck struct { diff --git a/internal/config/validate.go b/internal/config/validate.go index 9030bd8..e1123e9 100644 --- a/internal/config/validate.go +++ b/internal/config/validate.go @@ -61,6 +61,11 @@ func (c Config) Validate() error { if c.Startup.NodeInventoryReachPollSeconds <= 0 { return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0") } + for _, node := range c.Startup.NodeInventoryReachRequiredNodes { + if strings.TrimSpace(node) == "" { + return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty") + } + } for node, labels := range c.Startup.RequiredNodeLabels { if strings.TrimSpace(node) == "" { return fmt.Errorf("config.startup.required_node_labels keys must not be empty") @@ -233,18 +238,37 @@ func (c Config) Validate() error { if c.Startup.NodeSSHAuthPollSeconds 
<= 0 { return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0") } + for _, node := range c.Startup.NodeSSHAuthRequiredNodes { + if strings.TrimSpace(node) == "" { + return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty") + } + } if c.Startup.FluxHealthWaitSeconds <= 0 { return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0") } if c.Startup.FluxHealthPollSeconds <= 0 { return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0") } + for _, item := range c.Startup.FluxHealthRequiredKustomizations { + item = strings.TrimSpace(item) + if item == "" { + return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty") + } + if strings.Count(item, "/") != 1 { + return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item) + } + } if c.Startup.WorkloadConvergenceWaitSeconds <= 0 { return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0") } if c.Startup.WorkloadConvergencePollSeconds <= 0 { return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0") } + for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces { + if strings.TrimSpace(ns) == "" { + return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty") + } + } if c.Startup.StuckPodGraceSeconds <= 0 { return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0") } @@ -277,6 +301,16 @@ func (c Config) Validate() error { return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty") } } + for _, item := range c.Startup.FluxHealthRequiredKustomizations { + if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) { + return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item)) + } + } + for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces { + if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) { + return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns)) + } + } for _, node := range c.Startup.IgnoreUnavailableNodes { if strings.TrimSpace(node) == "" { return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty") @@ -328,3 +362,20 @@ func (c Config) Validate() error { } return nil } + +// containsTrimmed runs one orchestration or CLI step. +// Signature: containsTrimmed(entries []string, needle string) bool. +// Why: startup config now supports both required and ignored recovery scopes, so +// validation needs a single normalized overlap check for those lists. 
+func containsTrimmed(entries []string, needle string) bool { + needle = strings.TrimSpace(needle) + if needle == "" { + return false + } + for _, entry := range entries { + if strings.TrimSpace(entry) == needle { + return true + } + } + return false +} diff --git a/internal/config/validate_matrix_test.go b/internal/config/validate_matrix_test.go index bc24d81..1d746f9 100644 --- a/internal/config/validate_matrix_test.go +++ b/internal/config/validate_matrix_test.go @@ -30,6 +30,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) { {"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }}, {"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }}, {"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }}, + {"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }}, {"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }}, {"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }}, {"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }}, @@ -68,15 +69,27 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) { {"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }}, {"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }}, {"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }}, + {"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }}, {"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }}, {"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }}, + {"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }}, + {"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }}, {"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }}, {"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }}, + {"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }}, {"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }}, {"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }}, {"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }}, {"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }}, {"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }}, + {"bad_overlap_flux_required_and_ignored", func(c *Config) { + c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"} + c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"} + }}, + {"bad_overlap_workload_required_and_ignored", func(c *Config) { + c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"} + c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"} + }}, {"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }}, 
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }}, {"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }}, @@ -121,6 +134,10 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) { if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" { t.Fatalf("expected startup defaults to be set") } + if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil || + cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil { + t.Fatalf("expected startup recovery scope slices to be initialized") + } if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 { t.Fatalf("expected critical service endpoint timing defaults to be set") } diff --git a/testing/orchestrator/hooks_flux_workload_matrix_test.go b/testing/orchestrator/hooks_flux_workload_matrix_test.go index 3cf7a22..4f93ecb 100644 --- a/testing/orchestrator/hooks_flux_workload_matrix_test.go +++ b/testing/orchestrator/hooks_flux_workload_matrix_test.go @@ -79,6 +79,29 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) { } }) + t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"} + + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"): + return `{"items":[ +{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}, +{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}} +]}`, nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + } + orch, _ := newHookOrchestrator(t, cfg, run, run) + ready, detail, err := orch.TestHookFluxHealthReady(context.Background()) + if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") { + t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err) + } + }) + t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) { cfg := lifecycleConfig(t) cfg.Startup.WorkloadConvergenceWaitSeconds = 1 @@ -145,6 +168,42 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) { } }) + t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"} + cfg.Startup.StuckPodGraceSeconds = 1 + + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"): + return `{"items":[ +{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}}, +{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}} +]}`, nil + case name == "kubectl" && strings.Contains(command, "get pods -A -o json"): + return `{"items":[ +{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}}, +{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}} +]}`, nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + } + orch, _ := newHookOrchestrator(t, cfg, run, run) + ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background()) + if err != nil || !ready || !strings.Contains(detail, "controllers ready=") { + t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err) + } + failures, err := orch.TestHookStartupFailurePods(context.Background()) + if err != nil { + t.Fatalf("startup failure pod query: %v", err) + } + if len(failures) != 0 { + t.Fatalf("expected optional namespace failures to be ignored, got %v", failures) + } + }) + t.Run("critical-workload-replica-heal-branches", func(t *testing.T) { cfg := lifecycleConfig(t) run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { diff --git a/testing/orchestrator/hooks_ingress_service_matrix_test.go b/testing/orchestrator/hooks_ingress_service_matrix_test.go index 55275f7..a05ae34 100644 --- a/testing/orchestrator/hooks_ingress_service_matrix_test.go +++ b/testing/orchestrator/hooks_ingress_service_matrix_test.go @@ -53,6 +53,48 @@ func TestHookIngressServiceMatrix(t *testing.T) { } }) + t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.RequiredNodeLabels = map[string]map[string]string{ + "titan-09": { + "ananke.bstein.dev/harbor-bootstrap": "true", + }, + } + cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"} + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") { + t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command) + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + orch, _ := newHookOrchestrator(t, cfg, run, run) + if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil { + t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err) + } + }) + + t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.RequiredNodeLabels = map[string]map[string]string{ + "titan-09": { + "ananke.bstein.dev/harbor-bootstrap": "true", + }, + } + cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"} + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") { + return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found") + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ }
+ orch, _ := newHookOrchestrator(t, cfg, run, run)
+ if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
+ t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
+ }
+ })
+
 t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
 tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 w.WriteHeader(http.StatusOK)
diff --git a/testing/orchestrator/hooks_lifecycle_cleanup_branch_matrix_test.go b/testing/orchestrator/hooks_lifecycle_cleanup_branch_matrix_test.go
index 71669a6..411b374 100644
--- a/testing/orchestrator/hooks_lifecycle_cleanup_branch_matrix_test.go
+++ b/testing/orchestrator/hooks_lifecycle_cleanup_branch_matrix_test.go
@@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
 switch {
 case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
 apiVersionCalls++
- if apiVersionCalls == 1 {
+ if apiVersionCalls <= 2 {
 return "", errors.New("api down")
 }
 return "v1.31.0", nil
diff --git a/testing/orchestrator/hooks_startup_scope_vault_test.go b/testing/orchestrator/hooks_startup_scope_vault_test.go
new file mode 100644
index 0000000..6d9b0fe
--- /dev/null
+++ b/testing/orchestrator/hooks_startup_scope_vault_test.go
@@ -0,0 +1,222 @@
+package orchestrator
+
+import (
+ "context"
+ "errors"
+ "os"
+ "strings"
+ "testing"
+ "time"
+
+ "scm.bstein.dev/bstein/ananke/internal/cluster"
+)
+
+// readStartupProgress reads the startup progress report written by the orchestrator.
+// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
+// Why: startup helper tests need to inspect progress artifacts without reaching
+// into internal package state from the top-level testing module.
+func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
+ t.Helper()
+ payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
+ if err != nil {
+ t.Fatalf("read startup progress: %v", err)
+ }
+ return string(payload)
+}
+
+// TestHookStartupScopeAndVaultHelpers exercises the startup-scope and startup-Vault helper branches.
+// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
+// Why: keeps startup-scope and startup-Vault helper branches covered from the
+// split top-level testing module required by the repo hygiene contract.
+func TestHookStartupScopeAndVaultHelpers(t *testing.T) { + t.Run("startup-scope-helpers", func(t *testing.T) { + nodes := []string{"titan-db", " titan-23 ", "", "titan-24"} + if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) { + t.Fatalf("expected passthrough node list, got %v", got) + } + got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"}) + if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" { + t.Fatalf("unexpected filtered node list: %v", got) + } + + if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") { + t.Fatalf("expected trimmed node membership match") + } + if cluster.TestHookContainsNode([]string{"titan-db"}, " ") { + t.Fatalf("expected blank node probe to be ignored") + } + + cfg := lifecycleConfig(t) + orch, _ := newHookOrchestrator(t, cfg, nil, nil) + if !orch.TestHookStartupNodeStrictlyRequired("titan-23") { + t.Fatalf("expected all nodes to be strict when no recovery scopes are configured") + } + + cfgScoped := lifecycleConfig(t) + cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"} + cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"} + cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "} + cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "} + orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil) + if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") { + t.Fatalf("expected control plane to remain strict") + } + if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") { + t.Fatalf("expected inventory-scoped node to remain strict") + } + if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") { + t.Fatalf("expected ssh-scoped node to remain strict") + } + if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") { + t.Fatalf("expected non-core worker to stop being strict") + } + + flux := orchScoped.TestHookStartupRequiredFluxKustomizations() + if _, ok := flux["flux-system/core"]; !ok { + t.Fatalf("expected core flux kustomization in required set: %v", flux) + } + if _, ok := flux["flux-system/gitea"]; !ok { + t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux) + } + + namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces() + if _, ok := namespaces["vault"]; !ok { + t.Fatalf("expected vault namespace in required set: %v", namespaces) + } + if _, ok := namespaces["monitoring"]; !ok { + t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces) + } + }) + + t.Run("startup-vault-helpers", func(t *testing.T) { + t.Run("early-vault-unseal-paths", func(t *testing.T) { + cfgAPI := lifecycleConfig(t) + runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") { + return "", errors.New("api down") + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI) + orchAPI.TestHookBeginStartupReport("startup-vault") + orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background()) + if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") { + t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload) + } + + cfgErr := lifecycleConfig(t) + runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"): + return "v1.31.0", nil + case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"): + return "", errors.New("phase probe failed") + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + } + orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr) + orchErr.TestHookBeginStartupReport("startup-vault") + orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background()) + if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") { + t.Fatalf("expected early vault auto-heal detail, payload=%s", payload) + } + + cfgDeferred := lifecycleConfig(t) + runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"): + return "v1.31.0", nil + case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"): + return "Pending", nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + } + orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred) + orchDeferred.TestHookBeginStartupReport("startup-vault") + orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background()) + if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) { + t.Fatalf("expected deferred early vault detail, payload=%s", payload) + } + + cfgSuccess := lifecycleConfig(t) + runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"): + return "v1.31.0", nil + case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"): + return "Running", nil + case name == "kubectl" && strings.Contains(command, "vault status -format=json"): + return `{"sealed":false,"initialized":true}`, nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + } + orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess) + orchSuccess.TestHookBeginStartupReport("startup-vault") + orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background()) + if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) { + t.Fatalf("expected successful early vault check, payload=%s", payload) + } + }) + + t.Run("startup-vault-gate-paths", func(t *testing.T) { + cfgErr := lifecycleConfig(t) + runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") { + return "", errors.New("phase probe failed") + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr) + orchErr.TestHookBeginStartupReport("startup-vault") + if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") { + t.Fatalf("expected startup vault gate error, got %v", err) + } + + cfgDeferred := lifecycleConfig(t) + runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") { + return "Pending", nil + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred) + orchDeferred.TestHookBeginStartupReport("startup-vault") + if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil { + t.Fatalf("expected deferred startup vault gate to succeed, got %v", err) + } + if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) { + t.Fatalf("expected deferred startup vault detail, payload=%s", payload) + } + + cfgSuccess := lifecycleConfig(t) + runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + switch { + case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"): + return "Running", nil + case name == "kubectl" && strings.Contains(command, "vault status -format=json"): + return `{"sealed":false,"initialized":true}`, nil + default: + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
+ } + } + orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess) + orchSuccess.TestHookBeginStartupReport("startup-vault") + if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil { + t.Fatalf("expected successful startup vault gate, got %v", err) + } + if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) { + t.Fatalf("expected successful startup vault detail, payload=%s", payload) + } + }) + }) +} diff --git a/testing/orchestrator/hooks_workload_storage_access_matrix_test.go b/testing/orchestrator/hooks_workload_storage_access_matrix_test.go index bad5e3f..2871191 100644 --- a/testing/orchestrator/hooks_workload_storage_access_matrix_test.go +++ b/testing/orchestrator/hooks_workload_storage_access_matrix_test.go @@ -165,6 +165,32 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) { t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err) } }) + + t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) { + cfg := lifecycleConfig(t) + cfg.Startup.RequireNodeSSHAuth = true + cfg.Startup.NodeSSHAuthWaitSeconds = 1 + cfg.Startup.NodeSSHAuthPollSeconds = 1 + cfg.Startup.NodeInventoryReachWaitSeconds = 1 + cfg.Startup.NodeInventoryReachPollSeconds = 1 + cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"} + cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"} + + run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { + command := name + " " + strings.Join(args, " ") + if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") { + return "", errors.New("no route to host") + } + return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) + } + orch, _ := newHookOrchestrator(t, cfg, run, run) + if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil { + t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err) + } + if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil { + t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err) + } + }) }) t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {