startup(ananke): scope emergency recovery to core services

This commit is contained in:
codex 2026-05-05 05:17:59 -03:00
parent a3e24b9b15
commit 1f656de5df
27 changed files with 946 additions and 139 deletions

View File

@ -97,10 +97,15 @@ Primary config path:
Keep these fields accurate:
- `expected_flux_source_url`
- `expected_flux_branch`
- `startup.service_checklist_explicit_only`
- `startup.service_checklist`
- `startup.critical_service_endpoints`
- `startup.require_ingress_checklist`
- `startup.require_node_inventory_reachability`
- `startup.node_inventory_reachability_required_nodes`
- `startup.node_ssh_auth_required_nodes`
- `startup.flux_health_required_kustomizations`
- `startup.workload_convergence_required_namespaces`
- `startup.ignore_unavailable_nodes`
- `coordination.role`
- `coordination.peer_hosts`
@ -134,9 +139,10 @@ Installer behavior:
When adding nodes or services:
1. Update inventory and node mapping in config.
2. Add/adjust service checklist entries for anything user-facing or critical.
3. Add/adjust ingress expectations for exposed services.
4. Use temporary ignores only when truly intentional, then remove them.
5. Run `scripts/quality_gate.sh` before host deployment.
2. Keep the explicit service checklist focused on the core services that must come back during an outage.
3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap.
4. Add/adjust ingress expectations for exposed services.
5. Use temporary ignores only when truly intentional, then remove them.
6. Run `scripts/quality_gate.sh` before host deployment.
Recovery quality should improve over time: every drill should reduce manual work in the next drill.

View File

@ -51,6 +51,7 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes: []
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
@ -90,6 +91,7 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: false
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@ -134,13 +136,16 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes: []
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations: []
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces: []
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []

View File

@ -117,6 +117,10 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes:
- titan-0a
- titan-0b
- titan-0c
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
@ -156,6 +160,7 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@ -200,13 +205,39 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes:
- titan-0a
- titan-0b
- titan-0c
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations:
- flux-system/core
- flux-system/helm
- flux-system/traefik
- flux-system/cert-manager
- flux-system/longhorn
- flux-system/vault-csi
- flux-system/vault-injector
- flux-system/postgres
- flux-system/vault
- flux-system/keycloak
- flux-system/oauth2-proxy
- flux-system/gitea
- flux-system/monitoring
- flux-system/harbor
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces:
- vault
- postgres
- sso
- gitea
- monitoring
- harbor
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []

View File

@ -117,6 +117,10 @@ startup:
require_node_inventory_reachability: true
node_inventory_reachability_wait_seconds: 300
node_inventory_reachability_poll_seconds: 5
node_inventory_reachability_required_nodes:
- titan-0a
- titan-0b
- titan-0c
required_node_labels:
titan-09:
ananke.bstein.dev/harbor-bootstrap: "true"
@ -156,6 +160,7 @@ startup:
admin_secret_name: keycloak-admin
admin_secret_username_key: username
admin_secret_password_key: password
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
@ -200,13 +205,39 @@ startup:
require_node_ssh_auth: true
node_ssh_auth_wait_seconds: 240
node_ssh_auth_poll_seconds: 5
node_ssh_auth_required_nodes:
- titan-0a
- titan-0b
- titan-0c
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5
flux_health_required_kustomizations:
- flux-system/core
- flux-system/helm
- flux-system/traefik
- flux-system/cert-manager
- flux-system/longhorn
- flux-system/vault-csi
- flux-system/vault-injector
- flux-system/postgres
- flux-system/vault
- flux-system/keycloak
- flux-system/oauth2-proxy
- flux-system/gitea
- flux-system/monitoring
- flux-system/harbor
ignore_flux_kustomizations: []
require_workload_convergence: true
workload_convergence_wait_seconds: 900
workload_convergence_poll_seconds: 5
workload_convergence_required_namespaces:
- vault
- postgres
- sso
- gitea
- monitoring
- harbor
ignore_workload_namespaces: []
ignore_workloads: []
ignore_unavailable_nodes: []

View File

@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
seen := map[string]struct{}{}
targets := make([]string, 0, len(nodes))
for _, node := range nodes {
for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) {
node = strings.TrimSpace(node)
if node == "" {
continue

View File

@ -227,6 +227,31 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
}
// ensureVaultUnsealedWhenRunnable checks whether vault-0 is in a runnable state
// before attempting an unseal.
// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error).
// Returns (deferred, detail, err): deferred=true with a human-readable detail
// when the pod is absent or not yet Running, so startup can postpone the unseal
// until critical workload recovery; otherwise it delegates to the strict
// unseal helper, which stays strict for explicit recovery paths and tests.
func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) {
	if o.runner.DryRun {
		return false, "", nil
	}
	phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
	switch {
	case err != nil && isNotFoundErr(err):
		// The pod object does not exist yet: defer rather than fail startup.
		return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil
	case err != nil:
		return false, "", fmt.Errorf("vault pod phase check failed: %w", err)
	}
	if current := strings.TrimSpace(phase); current != "Running" {
		return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", current), nil
	}
	return false, "", o.ensureVaultUnsealed(ctx)
}
// ensureVaultUnsealed runs one orchestration or CLI step.
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.

View File

@ -143,6 +143,8 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
}
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
required := o.startupRequiredFluxKustomizations()
requiredSeen := map[string]struct{}{}
notReady := []string{}
for _, ks := range list.Items {
ns := strings.TrimSpace(ks.Metadata.Namespace)
@ -154,6 +156,12 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
if ks.Spec.Suspend {
continue
}
if len(required) > 0 {
if _, ok := required[full]; !ok {
continue
}
requiredSeen[full] = struct{}{}
}
if _, ok := ignored[full]; ok {
continue
}
@ -173,10 +181,25 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
}
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
}
if len(required) > 0 {
missing := []string{}
for full := range required {
if _, ok := requiredSeen[full]; !ok {
missing = append(missing, full+"(missing)")
}
}
if len(missing) > 0 {
sort.Strings(missing)
notReady = append(notReady, missing...)
}
}
if len(notReady) > 0 {
sort.Strings(notReady)
return false, "not ready: " + joinLimited(notReady, 6), nil
}
if len(required) > 0 {
return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil
}
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
}

View File

@ -19,6 +19,7 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
return nil
}
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
for node := range o.cfg.Startup.RequiredNodeLabels {
node = strings.TrimSpace(node)
@ -28,6 +29,10 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
}
sort.Strings(nodes)
for _, node := range nodes {
if _, skip := ignored[node]; skip {
o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
continue
}
labels := o.cfg.Startup.RequiredNodeLabels[node]
if len(labels) == 0 {
continue
@ -55,6 +60,11 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
continue
}
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
continue
}
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
}
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))

View File

@ -37,14 +37,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
return invErr
}
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
if err := o.waitForAPI(ctx, 1, time.Second); err == nil {
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed while kubernetes api is already available")
if err := o.ensureVaultUnsealed(ctx); err != nil {
o.noteStartupCheck("vault-unseal", false, err.Error())
return err
}
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
}
o.maybeRunEarlyVaultUnseal(ctx)
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
@ -187,12 +180,9 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
}
}
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
if err := o.ensureVaultUnsealed(ctx); err != nil {
o.noteStartupCheck("vault-unseal", false, err.Error())
if err := o.runStartupVaultUnsealGate(ctx); err != nil {
return err
}
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
return err
}
@ -490,18 +480,3 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
o.log.Printf("shutdown flow complete")
return nil
}
// normalizeShutdownMode runs one orchestration or CLI step.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
switch strings.TrimSpace(raw) {
case "", "config", "cluster-only":
return "cluster-only", nil
case "poweroff":
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
default:
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
}

View File

@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
seen := map[string]struct{}{}
for _, node := range o.inventoryNodesForValidation() {
for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) {
node = strings.TrimSpace(node)
if node == "" {
continue

View File

@ -0,0 +1,21 @@
package cluster
import (
"fmt"
"strings"
)
// normalizeShutdownMode validates a requested shutdown mode and maps it to the
// canonical "cluster-only" value.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
func normalizeShutdownMode(raw string) (string, error) {
	mode := strings.TrimSpace(raw)
	if mode == "" || mode == "config" || mode == "cluster-only" {
		// Legacy and empty inputs all collapse to the single supported mode.
		return "cluster-only", nil
	}
	if mode == "poweroff" {
		return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
	}
	return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}

View File

@ -0,0 +1,81 @@
package cluster
import "strings"
// startupRequiredNodes narrows a node list to the configured core recovery set.
// Signature: startupRequiredNodes(nodes []string, required []string) []string.
// Why: lets startup enforce a smaller core node set during outage recovery
// without losing the stricter all-nodes behavior when no override is configured.
func startupRequiredNodes(nodes []string, required []string) []string {
	scope := makeStringSet(required)
	if len(scope) == 0 {
		// No explicit core scope configured: fall back to the full node list.
		return nodes
	}
	filtered := make([]string, 0, len(nodes))
	for _, candidate := range nodes {
		trimmed := strings.TrimSpace(candidate)
		if trimmed == "" {
			continue
		}
		if _, inScope := scope[trimmed]; inScope {
			filtered = append(filtered, trimmed)
		}
	}
	return filtered
}
// startupNodeStrictlyRequired reports whether a node must still block
// recovery-only startup actions.
// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool.
// Why: absent or broken non-core nodes should not block recovery-only actions
// like label reconciliation once the operator has narrowed startup to core nodes.
func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool {
	trimmed := strings.TrimSpace(node)
	if trimmed == "" {
		return false
	}
	reachScope := o.cfg.Startup.NodeInventoryReachRequiredNodes
	sshScope := o.cfg.Startup.NodeSSHAuthRequiredNodes
	// With no configured core scopes, every node keeps the strict behavior.
	if len(reachScope) == 0 && len(sshScope) == 0 {
		return true
	}
	// Control-plane nodes are always strictly required regardless of scoping.
	for _, controlPlane := range o.cfg.ControlPlanes {
		if strings.TrimSpace(controlPlane) == trimmed {
			return true
		}
	}
	return containsNode(reachScope, trimmed) || containsNode(sshScope, trimmed)
}
// startupRequiredFluxKustomizations returns the configured core flux scope as a set.
// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}.
// Why: lets outage recovery wait on a declared core GitOps slice while leaving
// optional stacks free to converge after bootstrap succeeds.
func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} {
	scoped := o.cfg.Startup.FluxHealthRequiredKustomizations
	return makeStringSet(scoped)
}
// startupRequiredWorkloadNamespaces returns the configured core workload
// namespace scope as a set.
// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: keeps workload readiness scoped to core namespaces during recovery while
// preserving broad convergence checks when no explicit core list is configured.
func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} {
	scoped := o.cfg.Startup.WorkloadConvergenceRequiredNamespaces
	return makeStringSet(scoped)
}
// containsNode reports whether a trimmed needle appears among the entries
// (entries are also compared after trimming).
// Signature: containsNode(entries []string, needle string) bool.
// Why: keeps node-scope checks small and explicit anywhere startup narrows its
// recovery gates to a declared core set.
func containsNode(entries []string, needle string) bool {
	target := strings.TrimSpace(needle)
	if target == "" {
		return false
	}
	found := false
	for _, entry := range entries {
		if strings.TrimSpace(entry) == target {
			found = true
			break
		}
	}
	return found
}

View File

@ -0,0 +1,52 @@
package cluster
import (
"context"
"fmt"
"time"
)
// maybeRunEarlyVaultUnseal attempts a best-effort vault unseal when the
// kubernetes API is already reachable.
// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context).
// Why: gives startup a best-effort Vault recovery path when the API is already
// live, without consuming the hard startup failure path before workloads recover.
// NOTE(review): on error or deferral the "vault-unseal-early" check is never
// marked failed — presumably intentional best-effort semantics; confirm the
// status surface tolerates a check left in the running state.
func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) {
	if err := o.waitForAPI(ctx, 1, time.Second); err != nil {
		// API is not reachable yet; the regular startup gate handles unseal later.
		return
	}
	o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available")
	deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
	switch {
	case err != nil:
		o.log.Printf("warning: early vault unseal deferred: %v", err)
		o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err))
	case deferred:
		o.log.Printf("vault early unseal deferred: %s", detail)
		o.noteStartupAutoHeal(detail)
	default:
		o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed")
	}
}
// runStartupVaultUnsealGate runs the blocking vault unseal gate during startup.
// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error.
// Why: keeps the top-level startup flow readable while allowing Vault unseal to
// defer cleanly until critical workload recovery when the pod is not runnable yet.
func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error {
	o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
	deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
	if err != nil {
		o.noteStartupCheck("vault-unseal", false, err.Error())
		return err
	}
	if !deferred {
		o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
		return nil
	}
	// Deferral is recorded as a pass so startup can continue; workload recovery
	// is expected to bring vault-0 into a runnable state afterwards.
	o.log.Printf("vault unseal deferred until workload recovery: %s", detail)
	o.noteStartupAutoHeal(detail)
	o.noteStartupCheck("vault-unseal", true, detail)
	return nil
}

View File

@ -71,6 +71,7 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if err := json.Unmarshal([]byte(out), &list); err != nil {
return false, "", fmt.Errorf("decode controllers: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
@ -84,6 +85,11 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
if kind == "" || ns == "" || name == "" {
continue
}
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}

View File

@ -116,6 +116,7 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
return nil, fmt.Errorf("decode pods: %w", err)
}
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
stuckReasons := map[string]struct{}{
@ -138,6 +139,11 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
if ns == "" || name == "" {
continue
}
if len(requiredNamespaces) > 0 {
if _, ok := requiredNamespaces[ns]; !ok {
continue
}
}
if _, ok := ignoredNamespaces[ns]; ok {
continue
}

View File

@ -0,0 +1,55 @@
package cluster
import "context"
// TestHookStartupRequiredNodes delegates to startupRequiredNodes.
// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string.
// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing.
func TestHookStartupRequiredNodes(nodes []string, required []string) []string {
	return startupRequiredNodes(nodes, required)
}
// TestHookContainsNode delegates to containsNode.
// Signature: TestHookContainsNode(entries []string, needle string) bool.
// Why: exposes the small startup-scope membership helper to top-level tests.
func TestHookContainsNode(entries []string, needle string) bool {
	return containsNode(entries, needle)
}
// TestHookStartupNodeStrictlyRequired delegates to startupNodeStrictlyRequired.
// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool.
// Why: exposes strict-node startup scoping so outage-recovery tests can confirm
// non-core nodes stop blocking bootstrap.
func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool {
	return o.startupNodeStrictlyRequired(node)
}
// TestHookStartupRequiredFluxKustomizations delegates to startupRequiredFluxKustomizations.
// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}.
// Why: exposes flux startup scoping so top-level tests can confirm only core
// kustomizations block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} {
	return o.startupRequiredFluxKustomizations()
}
// TestHookStartupRequiredWorkloadNamespaces delegates to startupRequiredWorkloadNamespaces.
// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}.
// Why: exposes workload namespace startup scoping so top-level tests can
// confirm only core workloads block emergency bootstrap.
func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} {
	return o.startupRequiredWorkloadNamespaces()
}
// TestHookMaybeRunEarlyVaultUnseal delegates to maybeRunEarlyVaultUnseal.
// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context).
// Why: exposes the early startup Vault deferral helper to top-level tests.
func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) {
	o.maybeRunEarlyVaultUnseal(ctx)
}
// TestHookRunStartupVaultUnsealGate delegates to runStartupVaultUnsealGate.
// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error.
// Why: exposes the startup Vault gate helper to top-level tests.
func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error {
	return o.runStartupVaultUnsealGate(ctx)
}

View File

@ -33,6 +33,9 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
c.Startup.NodeInventoryReachPollSeconds = 5
}
if c.Startup.NodeInventoryReachRequiredNodes == nil {
c.Startup.NodeInventoryReachRequiredNodes = []string{}
}
if c.Startup.RequiredNodeLabels == nil {
c.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
@ -121,7 +124,11 @@ func (c *Config) applyDefaults() {
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
}
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
if c.Startup.ServiceChecklistExplicitOnly {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{})
} else {
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
}
for i := range c.Startup.ServiceChecklist {
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
@ -152,12 +159,18 @@ func (c *Config) applyDefaults() {
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
c.Startup.NodeSSHAuthPollSeconds = 5
}
if c.Startup.NodeSSHAuthRequiredNodes == nil {
c.Startup.NodeSSHAuthRequiredNodes = []string{}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
c.Startup.FluxHealthWaitSeconds = 900
}
if c.Startup.FluxHealthPollSeconds <= 0 {
c.Startup.FluxHealthPollSeconds = 5
}
if c.Startup.FluxHealthRequiredKustomizations == nil {
c.Startup.FluxHealthRequiredKustomizations = []string{}
}
if c.Startup.IgnoreFluxKustomizations == nil {
c.Startup.IgnoreFluxKustomizations = []string{}
}
@ -167,6 +180,9 @@ func (c *Config) applyDefaults() {
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
c.Startup.WorkloadConvergencePollSeconds = 5
}
if c.Startup.WorkloadConvergenceRequiredNamespaces == nil {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{}
}
if c.Startup.IgnoreWorkloadNamespaces == nil {
c.Startup.IgnoreWorkloadNamespaces = []string{}
}

View File

@ -39,24 +39,25 @@ func defaults() Config {
"maintenance",
},
Startup: Startup{
APIWaitSeconds: 1200,
APIPollSeconds: 2,
ShutdownCooldownSeconds: 45,
RequireNodeInventoryReach: true,
NodeInventoryReachWaitSeconds: 300,
NodeInventoryReachPollSeconds: 5,
RequireTimeSync: true,
TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5,
TimeSyncMode: "quorum",
TimeSyncQuorum: 2,
ReconcileAccessOnBoot: true,
AutoEtcdRestoreOnAPIFailure: true,
EtcdRestoreControlPlane: "titan-0a",
RequireStorageReady: true,
StorageReadyWaitSeconds: 420,
StorageReadyPollSeconds: 5,
StorageMinReadyNodes: 2,
APIWaitSeconds: 1200,
APIPollSeconds: 2,
ShutdownCooldownSeconds: 45,
RequireNodeInventoryReach: true,
NodeInventoryReachWaitSeconds: 300,
NodeInventoryReachPollSeconds: 5,
NodeInventoryReachRequiredNodes: []string{},
RequireTimeSync: true,
TimeSyncWaitSeconds: 240,
TimeSyncPollSeconds: 5,
TimeSyncMode: "quorum",
TimeSyncQuorum: 2,
ReconcileAccessOnBoot: true,
AutoEtcdRestoreOnAPIFailure: true,
EtcdRestoreControlPlane: "titan-0a",
RequireStorageReady: true,
StorageReadyWaitSeconds: 420,
StorageReadyPollSeconds: 5,
StorageMinReadyNodes: 2,
StorageCriticalPVCs: []string{
"vault/data-vault-0",
"postgres/postgres-data-postgres-0",
@ -91,33 +92,36 @@ func defaults() Config {
AdminSecretUsernameKey: "username",
AdminSecretPasswordKey: "password",
},
ServiceChecklist: defaultServiceChecklist(),
RequireCriticalServiceEndpoints: true,
CriticalServiceEndpointWaitSec: 420,
CriticalServiceEndpointPollSec: 5,
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
RequireIngressChecklist: true,
IngressChecklistWaitSeconds: 420,
IngressChecklistPollSeconds: 5,
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
IngressChecklistIgnoreHosts: []string{},
RequireNodeSSHAuth: true,
NodeSSHAuthWaitSeconds: 240,
NodeSSHAuthPollSeconds: 5,
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
ServiceChecklist: defaultServiceChecklist(),
RequireCriticalServiceEndpoints: true,
CriticalServiceEndpointWaitSec: 420,
CriticalServiceEndpointPollSec: 5,
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
RequireIngressChecklist: true,
IngressChecklistWaitSeconds: 420,
IngressChecklistPollSeconds: 5,
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
IngressChecklistIgnoreHosts: []string{},
RequireNodeSSHAuth: true,
NodeSSHAuthWaitSeconds: 240,
NodeSSHAuthPollSeconds: 5,
NodeSSHAuthRequiredNodes: []string{},
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,
FluxHealthPollSeconds: 5,
FluxHealthRequiredKustomizations: []string{},
IgnoreFluxKustomizations: []string{},
RequireWorkloadConvergence: true,
WorkloadConvergenceWaitSeconds: 900,
WorkloadConvergencePollSeconds: 5,
WorkloadConvergenceRequiredNamespaces: []string{},
IgnoreWorkloadNamespaces: []string{},
IgnoreWorkloads: []string{},
IgnoreUnavailableNodes: []string{},
AutoRecycleStuckPods: true,
StuckPodGraceSeconds: 180,
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
VaultUnsealBreakglassTimeout: 15,
},
Shutdown: Shutdown{
DefaultBudgetSeconds: 1380,

View File

@ -51,3 +51,41 @@ startup:
t.Fatalf("expected validation failure")
}
}
// TestLoadKeepsExplicitServiceChecklist verifies that Load honors
// service_checklist_explicit_only and does not merge in the default catalog.
// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T).
// Why: host recovery configs must be able to keep a narrow, explicit checklist
// without silently inheriting the full default service catalog.
func TestLoadKeepsExplicitServiceChecklist(t *testing.T) {
	cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
	// NOTE(review): the YAML below must keep valid nesting under startup: —
	// confirm indentation in the actual file; the diff view strips it.
	raw := `
control_planes: [titan-0a]
expected_flux_branch: main
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
iac_repo_path: /opt/titan-iac
startup:
service_checklist_explicit_only: true
service_checklist:
- name: gitea-api
url: https://scm.bstein.dev/api/healthz
accepted_statuses: [200]
body_contains: pass
timeout_seconds: 12
ups:
enabled: false
`
	if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil {
		t.Fatalf("write config: %v", err)
	}
	cfg, err := Load(cfgPath)
	if err != nil {
		t.Fatalf("load config: %v", err)
	}
	// Exactly the one explicit check should survive, with no defaults merged in.
	if len(cfg.Startup.ServiceChecklist) != 1 {
		t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist))
	}
	if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" {
		t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name)
	}
}

View File

@ -27,65 +27,70 @@ type Config struct {
}
// Startup configures the boot-time recovery gates the orchestrator walks
// through before declaring the cluster healthy: API availability, node
// reachability/SSH auth, time sync, storage, service/ingress checklists,
// Flux health, workload convergence, and Vault unseal handling.
//
// NOTE(review): the original declaration carried two copies of every field
// (pre- and post-change diff residue), which is invalid Go — duplicate field
// names in one struct. Only the superseding field set, including the new
// `*_required_*` recovery-scope fields, is kept here.
//
// The `*RequiredNodes`, `FluxHealthRequiredKustomizations`, and
// `WorkloadConvergenceRequiredNamespaces` slices narrow the corresponding
// gate to a core subset; an empty slice means "require everything"
// (presumably — confirm against the gate implementations).
type Startup struct {
	APIWaitSeconds                  int     `yaml:"api_wait_seconds"`
	APIPollSeconds                  int     `yaml:"api_poll_seconds"`
	ShutdownCooldownSeconds         int     `yaml:"shutdown_cooldown_seconds"`
	MinimumBatteryPercent           float64 `yaml:"minimum_battery_percent"`
	// Node inventory reachability gate; RequiredNodes scopes it to core nodes.
	RequireNodeInventoryReach       bool                         `yaml:"require_node_inventory_reachability"`
	NodeInventoryReachWaitSeconds   int                          `yaml:"node_inventory_reachability_wait_seconds"`
	NodeInventoryReachPollSeconds   int                          `yaml:"node_inventory_reachability_poll_seconds"`
	NodeInventoryReachRequiredNodes []string                     `yaml:"node_inventory_reachability_required_nodes"`
	RequiredNodeLabels              map[string]map[string]string `yaml:"required_node_labels"`
	// Time synchronization gate (mode/quorum semantics live in the checker).
	RequireTimeSync     bool   `yaml:"require_time_sync"`
	TimeSyncWaitSeconds int    `yaml:"time_sync_wait_seconds"`
	TimeSyncPollSeconds int    `yaml:"time_sync_poll_seconds"`
	TimeSyncMode        string `yaml:"time_sync_mode"`
	TimeSyncQuorum      int    `yaml:"time_sync_quorum"`
	ReconcileAccessOnBoot       bool   `yaml:"reconcile_access_on_boot"`
	AutoEtcdRestoreOnAPIFailure bool   `yaml:"auto_etcd_restore_on_api_failure"`
	EtcdRestoreControlPlane     string `yaml:"etcd_restore_control_plane"`
	// Storage readiness gate.
	RequireStorageReady     bool     `yaml:"require_storage_ready"`
	StorageReadyWaitSeconds int      `yaml:"storage_ready_wait_seconds"`
	StorageReadyPollSeconds int      `yaml:"storage_ready_poll_seconds"`
	StorageMinReadyNodes    int      `yaml:"storage_min_ready_nodes"`
	StorageCriticalPVCs     []string `yaml:"storage_critical_pvcs"`
	// Post-start HTTP probes.
	RequirePostStartProbes    bool     `yaml:"require_post_start_probes"`
	PostStartProbeWaitSeconds int      `yaml:"post_start_probe_wait_seconds"`
	PostStartProbePollSeconds int      `yaml:"post_start_probe_poll_seconds"`
	PostStartProbes           []string `yaml:"post_start_probes"`
	// Service checklist gate; ExplicitOnly suppresses the default catalog so
	// only the checks listed under service_checklist are enforced.
	RequireServiceChecklist      bool                         `yaml:"require_service_checklist"`
	ServiceChecklistWaitSeconds  int                          `yaml:"service_checklist_wait_seconds"`
	ServiceChecklistPollSeconds  int                          `yaml:"service_checklist_poll_seconds"`
	ServiceChecklistStabilitySec int                          `yaml:"service_checklist_stability_seconds"`
	ServiceChecklistAuth         ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
	ServiceChecklistExplicitOnly bool                         `yaml:"service_checklist_explicit_only"`
	ServiceChecklist             []ServiceChecklistCheck      `yaml:"service_checklist"`
	// Critical in-cluster service endpoint gate.
	RequireCriticalServiceEndpoints bool     `yaml:"require_critical_service_endpoints"`
	CriticalServiceEndpointWaitSec  int      `yaml:"critical_service_endpoint_wait_seconds"`
	CriticalServiceEndpointPollSec  int      `yaml:"critical_service_endpoint_poll_seconds"`
	CriticalServiceEndpoints        []string `yaml:"critical_service_endpoints"`
	// Ingress checklist gate.
	RequireIngressChecklist      bool     `yaml:"require_ingress_checklist"`
	IngressChecklistWaitSeconds  int      `yaml:"ingress_checklist_wait_seconds"`
	IngressChecklistPollSeconds  int      `yaml:"ingress_checklist_poll_seconds"`
	IngressChecklistAccepted     []int    `yaml:"ingress_checklist_accepted_statuses"`
	IngressChecklistIgnoreHosts  []string `yaml:"ingress_checklist_ignore_hosts"`
	IngressChecklistInsecureSkip bool     `yaml:"ingress_checklist_insecure_skip_tls"`
	// Node SSH auth gate; RequiredNodes scopes it to core nodes.
	RequireNodeSSHAuth       bool     `yaml:"require_node_ssh_auth"`
	NodeSSHAuthWaitSeconds   int      `yaml:"node_ssh_auth_wait_seconds"`
	NodeSSHAuthPollSeconds   int      `yaml:"node_ssh_auth_poll_seconds"`
	NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"`
	// Flux health gate; RequiredKustomizations entries are namespace/name and
	// must not overlap IgnoreFluxKustomizations (enforced by Validate).
	RequireFluxHealth                bool     `yaml:"require_flux_health"`
	FluxHealthWaitSeconds            int      `yaml:"flux_health_wait_seconds"`
	FluxHealthPollSeconds            int      `yaml:"flux_health_poll_seconds"`
	FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"`
	IgnoreFluxKustomizations         []string `yaml:"ignore_flux_kustomizations"`
	// Workload convergence gate; RequiredNamespaces must not overlap
	// IgnoreWorkloadNamespaces (enforced by Validate).
	RequireWorkloadConvergence            bool     `yaml:"require_workload_convergence"`
	WorkloadConvergenceWaitSeconds        int      `yaml:"workload_convergence_wait_seconds"`
	WorkloadConvergencePollSeconds        int      `yaml:"workload_convergence_poll_seconds"`
	WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"`
	IgnoreWorkloadNamespaces              []string `yaml:"ignore_workload_namespaces"`
	IgnoreWorkloads                       []string `yaml:"ignore_workloads"`
	IgnoreUnavailableNodes                []string `yaml:"ignore_unavailable_nodes"`
	// Stuck-pod auto-recycling during convergence.
	AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
	StuckPodGraceSeconds int  `yaml:"stuck_pod_grace_seconds"`
	// Vault unseal handling.
	VaultUnsealKeyFile           string `yaml:"vault_unseal_key_file"`
	VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
	VaultUnsealBreakglassTimeout int    `yaml:"vault_unseal_breakglass_timeout_seconds"`
}
type ServiceChecklistCheck struct {

View File

@ -61,6 +61,11 @@ func (c Config) Validate() error {
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
}
for _, node := range c.Startup.NodeInventoryReachRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
}
}
for node, labels := range c.Startup.RequiredNodeLabels {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
@ -233,18 +238,37 @@ func (c Config) Validate() error {
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
}
for _, node := range c.Startup.NodeSSHAuthRequiredNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty")
}
}
if c.Startup.FluxHealthWaitSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
}
if c.Startup.FluxHealthPollSeconds <= 0 {
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
}
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
item = strings.TrimSpace(item)
if item == "" {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty")
}
if strings.Count(item, "/") != 1 {
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item)
}
}
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
}
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
}
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if strings.TrimSpace(ns) == "" {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty")
}
}
if c.Startup.StuckPodGraceSeconds <= 0 {
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
}
@ -277,6 +301,16 @@ func (c Config) Validate() error {
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
}
}
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) {
return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item))
}
}
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) {
return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns))
}
}
for _, node := range c.Startup.IgnoreUnavailableNodes {
if strings.TrimSpace(node) == "" {
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
@ -328,3 +362,20 @@ func (c Config) Validate() error {
}
return nil
}
// containsTrimmed reports whether needle, after trimming surrounding
// whitespace, equals any whitespace-trimmed entry of entries. A needle that
// trims to the empty string never matches.
// Signature: containsTrimmed(entries []string, needle string) bool.
// Why: startup config now supports both required and ignored recovery scopes,
// so validation needs a single normalized overlap check for those lists.
func containsTrimmed(entries []string, needle string) bool {
	target := strings.TrimSpace(needle)
	if target == "" {
		return false
	}
	for i := range entries {
		if strings.TrimSpace(entries[i]) == target {
			return true
		}
	}
	return false
}

View File

@ -30,6 +30,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
@ -68,15 +69,27 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
{"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }},
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
{"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }},
{"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }},
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
{"bad_overlap_flux_required_and_ignored", func(c *Config) {
c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"}
c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"}
}},
{"bad_overlap_workload_required_and_ignored", func(c *Config) {
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"}
}},
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
@ -121,6 +134,10 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
t.Fatalf("expected startup defaults to be set")
}
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
t.Fatalf("expected startup recovery scope slices to be initialized")
}
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
t.Fatalf("expected critical service endpoint timing defaults to be set")
}

View File

@ -79,6 +79,29 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
}
})
t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
return `{"items":[
{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}},
{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ready, detail, err := orch.TestHookFluxHealthReady(context.Background())
if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") {
t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
}
})
t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
@ -145,6 +168,42 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
}
})
t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
cfg.Startup.StuckPodGraceSeconds = 1
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
]}`, nil
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
return `{"items":[
{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}},
{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}
]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background())
if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
}
failures, err := orch.TestHookStartupFailurePods(context.Background())
if err != nil {
t.Fatalf("startup failure pod query: %v", err)
}
if len(failures) != 0 {
t.Fatalf("expected optional namespace failures to be ignored, got %v", failures)
}
})
t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {

View File

@ -53,6 +53,48 @@ func TestHookIngressServiceMatrix(t *testing.T) {
}
})
t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command)
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err)
}
})
t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-09": {
"ananke.bstein.dev/harbor-bootstrap": "true",
},
}
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
}
})
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)

View File

@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
apiVersionCalls++
if apiVersionCalls == 1 {
if apiVersionCalls <= 2 {
return "", errors.New("api down")
}
return "v1.31.0", nil

View File

@ -0,0 +1,222 @@
package orchestrator
import (
"context"
"errors"
"os"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
)
// readStartupProgress returns the contents of orch's startup progress
// artifact, failing the test immediately if it cannot be read.
// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
// Why: startup helper tests need to inspect progress artifacts without reaching
// into internal package state from the top-level testing module.
func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
	t.Helper()
	data, readErr := os.ReadFile(orch.TestHookStartupProgressPath())
	if readErr != nil {
		t.Fatalf("read startup progress: %v", readErr)
	}
	return string(data)
}
// TestHookStartupScopeAndVaultHelpers runs one orchestration or CLI step.
// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
// Why: keeps startup-scope and startup-Vault helper branches covered from the
// split top-level testing module required by the repo hygiene contract.
func TestHookStartupScopeAndVaultHelpers(t *testing.T) {
	t.Run("startup-scope-helpers", func(t *testing.T) {
		// A nil required-node scope passes the inventory through untouched;
		// a non-empty scope filters down to only the listed nodes.
		nodes := []string{"titan-db", " titan-23 ", "", "titan-24"}
		if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) {
			t.Fatalf("expected passthrough node list, got %v", got)
		}
		got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"})
		if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" {
			t.Fatalf("unexpected filtered node list: %v", got)
		}
		// Node membership is whitespace-insensitive, and a blank probe is
		// ignored rather than matching trimmed-empty entries.
		if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") {
			t.Fatalf("expected trimmed node membership match")
		}
		if cluster.TestHookContainsNode([]string{"titan-db"}, " ") {
			t.Fatalf("expected blank node probe to be ignored")
		}
		// With no recovery scopes configured, every node is strictly required.
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if !orch.TestHookStartupNodeStrictlyRequired("titan-23") {
			t.Fatalf("expected all nodes to be strict when no recovery scopes are configured")
		}
		// Scoped config: control planes plus the inventory- and ssh-scoped
		// nodes stay strict; any other worker becomes best-effort. Scope
		// entries are trimmed before use (note the padded values below).
		cfgScoped := lifecycleConfig(t)
		cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"}
		cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"}
		cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "}
		cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "}
		orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil)
		if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") {
			t.Fatalf("expected control plane to remain strict")
		}
		if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") {
			t.Fatalf("expected inventory-scoped node to remain strict")
		}
		if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") {
			t.Fatalf("expected ssh-scoped node to remain strict")
		}
		if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") {
			t.Fatalf("expected non-core worker to stop being strict")
		}
		// The required flux/workload sets are exposed as maps keyed by the
		// trimmed scope entries.
		flux := orchScoped.TestHookStartupRequiredFluxKustomizations()
		if _, ok := flux["flux-system/core"]; !ok {
			t.Fatalf("expected core flux kustomization in required set: %v", flux)
		}
		if _, ok := flux["flux-system/gitea"]; !ok {
			t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux)
		}
		namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces()
		if _, ok := namespaces["vault"]; !ok {
			t.Fatalf("expected vault namespace in required set: %v", namespaces)
		}
		if _, ok := namespaces["monitoring"]; !ok {
			t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces)
		}
	})
	t.Run("startup-vault-helpers", func(t *testing.T) {
		t.Run("early-vault-unseal-paths", func(t *testing.T) {
			// API unavailable: the early vault unseal check is skipped
			// entirely (no vault-unseal-early entry in the progress report).
			cfgAPI := lifecycleConfig(t)
			runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
					return "", errors.New("api down")
				}
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
			orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI)
			orchAPI.TestHookBeginStartupReport("startup-vault")
			orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background())
			if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") {
				t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload)
			}
			// Pod phase probe error: the unseal is deferred and the report
			// records the deferral detail instead of failing startup.
			cfgErr := lifecycleConfig(t)
			runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				switch {
				case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
					return "v1.31.0", nil
				case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
					return "", errors.New("phase probe failed")
				default:
					return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
				}
			}
			orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
			orchErr.TestHookBeginStartupReport("startup-vault")
			orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background())
			if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") {
				t.Fatalf("expected early vault auto-heal detail, payload=%s", payload)
			}
			// Pod not yet Running: the deferral detail names the pod phase
			// (JSON-escaped quotes in the progress payload).
			cfgDeferred := lifecycleConfig(t)
			runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				switch {
				case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
					return "v1.31.0", nil
				case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
					return "Pending", nil
				default:
					return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
				}
			}
			orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
			orchDeferred.TestHookBeginStartupReport("startup-vault")
			orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background())
			if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
				t.Fatalf("expected deferred early vault detail, payload=%s", payload)
			}
			// Happy path: pod Running and vault already unsealed, so the
			// early check passes and is recorded as such.
			cfgSuccess := lifecycleConfig(t)
			runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				switch {
				case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
					return "v1.31.0", nil
				case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
					return "Running", nil
				case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
					return `{"sealed":false,"initialized":true}`, nil
				default:
					return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
				}
			}
			orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
			orchSuccess.TestHookBeginStartupReport("startup-vault")
			orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background())
			if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) {
				t.Fatalf("expected successful early vault check, payload=%s", payload)
			}
		})
		t.Run("startup-vault-gate-paths", func(t *testing.T) {
			// Phase probe error: unlike the early check, the gate surfaces
			// the error to the caller.
			cfgErr := lifecycleConfig(t)
			runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
					return "", errors.New("phase probe failed")
				}
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
			orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
			orchErr.TestHookBeginStartupReport("startup-vault")
			if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") {
				t.Fatalf("expected startup vault gate error, got %v", err)
			}
			// Pod not Running: the gate succeeds but records the deferral
			// detail (pod phase) in the progress report.
			cfgDeferred := lifecycleConfig(t)
			runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
					return "Pending", nil
				}
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
			orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
			orchDeferred.TestHookBeginStartupReport("startup-vault")
			if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
				t.Fatalf("expected deferred startup vault gate to succeed, got %v", err)
			}
			if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
				t.Fatalf("expected deferred startup vault detail, payload=%s", payload)
			}
			// Happy path: pod Running and vault reports unsealed, so the
			// gate passes and the report records the success detail.
			cfgSuccess := lifecycleConfig(t)
			runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				switch {
				case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
					return "Running", nil
				case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
					return `{"sealed":false,"initialized":true}`, nil
				default:
					return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
				}
			}
			orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
			orchSuccess.TestHookBeginStartupReport("startup-vault")
			if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
				t.Fatalf("expected successful startup vault gate, got %v", err)
			}
			if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) {
				t.Fatalf("expected successful startup vault detail, payload=%s", payload)
			}
		})
	})
}

View File

@ -165,6 +165,32 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
}
})
t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.RequireNodeSSHAuth = true
cfg.Startup.NodeSSHAuthWaitSeconds = 1
cfg.Startup.NodeSSHAuthPollSeconds = 1
cfg.Startup.NodeInventoryReachWaitSeconds = 1
cfg.Startup.NodeInventoryReachPollSeconds = 1
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") {
return "", errors.New("no route to host")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err)
}
if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil {
t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err)
}
})
})
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {