startup(ananke): scope emergency recovery to core services
This commit is contained in:
parent
a3e24b9b15
commit
1f656de5df
14
README.md
14
README.md
@ -97,10 +97,15 @@ Primary config path:
|
||||
Keep these fields accurate:
|
||||
- `expected_flux_source_url`
|
||||
- `expected_flux_branch`
|
||||
- `startup.service_checklist_explicit_only`
|
||||
- `startup.service_checklist`
|
||||
- `startup.critical_service_endpoints`
|
||||
- `startup.require_ingress_checklist`
|
||||
- `startup.require_node_inventory_reachability`
|
||||
- `startup.node_inventory_reachability_required_nodes`
|
||||
- `startup.node_ssh_auth_required_nodes`
|
||||
- `startup.flux_health_required_kustomizations`
|
||||
- `startup.workload_convergence_required_namespaces`
|
||||
- `startup.ignore_unavailable_nodes`
|
||||
- `coordination.role`
|
||||
- `coordination.peer_hosts`
|
||||
@ -134,9 +139,10 @@ Installer behavior:
|
||||
|
||||
When adding nodes or services:
|
||||
1. Update inventory and node mapping in config.
|
||||
2. Add/adjust service checklist entries for anything user-facing or critical.
|
||||
3. Add/adjust ingress expectations for exposed services.
|
||||
4. Use temporary ignores only when truly intentional, then remove them.
|
||||
5. Run `scripts/quality_gate.sh` before host deployment.
|
||||
2. Keep the explicit service checklist focused on the core services that must come back during an outage.
|
||||
3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap.
|
||||
4. Add/adjust ingress expectations for exposed services.
|
||||
5. Use temporary ignores only when truly intentional, then remove them.
|
||||
6. Run `scripts/quality_gate.sh` before host deployment.
|
||||
|
||||
Recovery quality should improve over time: every drill should reduce manual work in the next drill.
|
||||
|
||||
@ -51,6 +51,7 @@ startup:
|
||||
require_node_inventory_reachability: true
|
||||
node_inventory_reachability_wait_seconds: 300
|
||||
node_inventory_reachability_poll_seconds: 5
|
||||
node_inventory_reachability_required_nodes: []
|
||||
required_node_labels:
|
||||
titan-09:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
@ -90,6 +91,7 @@ startup:
|
||||
admin_secret_name: keycloak-admin
|
||||
admin_secret_username_key: username
|
||||
admin_secret_password_key: password
|
||||
service_checklist_explicit_only: false
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
@ -134,13 +136,16 @@ startup:
|
||||
require_node_ssh_auth: true
|
||||
node_ssh_auth_wait_seconds: 240
|
||||
node_ssh_auth_poll_seconds: 5
|
||||
node_ssh_auth_required_nodes: []
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
flux_health_required_kustomizations: []
|
||||
ignore_flux_kustomizations: []
|
||||
require_workload_convergence: true
|
||||
workload_convergence_wait_seconds: 900
|
||||
workload_convergence_poll_seconds: 5
|
||||
workload_convergence_required_namespaces: []
|
||||
ignore_workload_namespaces: []
|
||||
ignore_workloads: []
|
||||
ignore_unavailable_nodes: []
|
||||
|
||||
@ -117,6 +117,10 @@ startup:
|
||||
require_node_inventory_reachability: true
|
||||
node_inventory_reachability_wait_seconds: 300
|
||||
node_inventory_reachability_poll_seconds: 5
|
||||
node_inventory_reachability_required_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
required_node_labels:
|
||||
titan-09:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
@ -156,6 +160,7 @@ startup:
|
||||
admin_secret_name: keycloak-admin
|
||||
admin_secret_username_key: username
|
||||
admin_secret_password_key: password
|
||||
service_checklist_explicit_only: true
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
@ -200,13 +205,39 @@ startup:
|
||||
require_node_ssh_auth: true
|
||||
node_ssh_auth_wait_seconds: 240
|
||||
node_ssh_auth_poll_seconds: 5
|
||||
node_ssh_auth_required_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
flux_health_required_kustomizations:
|
||||
- flux-system/core
|
||||
- flux-system/helm
|
||||
- flux-system/traefik
|
||||
- flux-system/cert-manager
|
||||
- flux-system/longhorn
|
||||
- flux-system/vault-csi
|
||||
- flux-system/vault-injector
|
||||
- flux-system/postgres
|
||||
- flux-system/vault
|
||||
- flux-system/keycloak
|
||||
- flux-system/oauth2-proxy
|
||||
- flux-system/gitea
|
||||
- flux-system/monitoring
|
||||
- flux-system/harbor
|
||||
ignore_flux_kustomizations: []
|
||||
require_workload_convergence: true
|
||||
workload_convergence_wait_seconds: 900
|
||||
workload_convergence_poll_seconds: 5
|
||||
workload_convergence_required_namespaces:
|
||||
- vault
|
||||
- postgres
|
||||
- sso
|
||||
- gitea
|
||||
- monitoring
|
||||
- harbor
|
||||
ignore_workload_namespaces: []
|
||||
ignore_workloads: []
|
||||
ignore_unavailable_nodes: []
|
||||
|
||||
@ -117,6 +117,10 @@ startup:
|
||||
require_node_inventory_reachability: true
|
||||
node_inventory_reachability_wait_seconds: 300
|
||||
node_inventory_reachability_poll_seconds: 5
|
||||
node_inventory_reachability_required_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
required_node_labels:
|
||||
titan-09:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
@ -156,6 +160,7 @@ startup:
|
||||
admin_secret_name: keycloak-admin
|
||||
admin_secret_username_key: username
|
||||
admin_secret_password_key: password
|
||||
service_checklist_explicit_only: true
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
@ -200,13 +205,39 @@ startup:
|
||||
require_node_ssh_auth: true
|
||||
node_ssh_auth_wait_seconds: 240
|
||||
node_ssh_auth_poll_seconds: 5
|
||||
node_ssh_auth_required_nodes:
|
||||
- titan-0a
|
||||
- titan-0b
|
||||
- titan-0c
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
flux_health_required_kustomizations:
|
||||
- flux-system/core
|
||||
- flux-system/helm
|
||||
- flux-system/traefik
|
||||
- flux-system/cert-manager
|
||||
- flux-system/longhorn
|
||||
- flux-system/vault-csi
|
||||
- flux-system/vault-injector
|
||||
- flux-system/postgres
|
||||
- flux-system/vault
|
||||
- flux-system/keycloak
|
||||
- flux-system/oauth2-proxy
|
||||
- flux-system/gitea
|
||||
- flux-system/monitoring
|
||||
- flux-system/harbor
|
||||
ignore_flux_kustomizations: []
|
||||
require_workload_convergence: true
|
||||
workload_convergence_wait_seconds: 900
|
||||
workload_convergence_poll_seconds: 5
|
||||
workload_convergence_required_namespaces:
|
||||
- vault
|
||||
- postgres
|
||||
- sso
|
||||
- gitea
|
||||
- monitoring
|
||||
- harbor
|
||||
ignore_workload_namespaces: []
|
||||
ignore_workloads: []
|
||||
ignore_unavailable_nodes: []
|
||||
|
||||
@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e
|
||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
seen := map[string]struct{}{}
|
||||
targets := make([]string, 0, len(nodes))
|
||||
for _, node := range nodes {
|
||||
for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) {
|
||||
node = strings.TrimSpace(node)
|
||||
if node == "" {
|
||||
continue
|
||||
|
||||
@ -227,6 +227,31 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er
|
||||
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
|
||||
}
|
||||
|
||||
// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error).
|
||||
// Why: lets startup defer vault unseal until the pod is actually runnable, while
|
||||
// keeping the direct unseal helper strict for explicit recovery paths and tests.
|
||||
func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) {
|
||||
if o.runner.DryRun {
|
||||
return false, "", nil
|
||||
}
|
||||
|
||||
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
|
||||
if err != nil {
|
||||
if isNotFoundErr(err) {
|
||||
return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil
|
||||
}
|
||||
return false, "", fmt.Errorf("vault pod phase check failed: %w", err)
|
||||
}
|
||||
|
||||
trimmedPhase := strings.TrimSpace(phase)
|
||||
if trimmedPhase != "Running" {
|
||||
return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil
|
||||
}
|
||||
|
||||
return false, "", o.ensureVaultUnsealed(ctx)
|
||||
}
|
||||
|
||||
// ensureVaultUnsealed runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
|
||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||
|
||||
@ -143,6 +143,8 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
|
||||
return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
|
||||
}
|
||||
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
|
||||
required := o.startupRequiredFluxKustomizations()
|
||||
requiredSeen := map[string]struct{}{}
|
||||
notReady := []string{}
|
||||
for _, ks := range list.Items {
|
||||
ns := strings.TrimSpace(ks.Metadata.Namespace)
|
||||
@ -154,6 +156,12 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
|
||||
if ks.Spec.Suspend {
|
||||
continue
|
||||
}
|
||||
if len(required) > 0 {
|
||||
if _, ok := required[full]; !ok {
|
||||
continue
|
||||
}
|
||||
requiredSeen[full] = struct{}{}
|
||||
}
|
||||
if _, ok := ignored[full]; ok {
|
||||
continue
|
||||
}
|
||||
@ -173,10 +181,25 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
|
||||
}
|
||||
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
|
||||
}
|
||||
if len(required) > 0 {
|
||||
missing := []string{}
|
||||
for full := range required {
|
||||
if _, ok := requiredSeen[full]; !ok {
|
||||
missing = append(missing, full+"(missing)")
|
||||
}
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
sort.Strings(missing)
|
||||
notReady = append(notReady, missing...)
|
||||
}
|
||||
}
|
||||
if len(notReady) > 0 {
|
||||
sort.Strings(notReady)
|
||||
return false, "not ready: " + joinLimited(notReady, 6), nil
|
||||
}
|
||||
if len(required) > 0 {
|
||||
return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil
|
||||
}
|
||||
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
|
||||
}
|
||||
|
||||
|
||||
@ -19,6 +19,7 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
|
||||
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
|
||||
return nil
|
||||
}
|
||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
|
||||
for node := range o.cfg.Startup.RequiredNodeLabels {
|
||||
node = strings.TrimSpace(node)
|
||||
@ -28,6 +29,10 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
|
||||
}
|
||||
sort.Strings(nodes)
|
||||
for _, node := range nodes {
|
||||
if _, skip := ignored[node]; skip {
|
||||
o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
|
||||
continue
|
||||
}
|
||||
labels := o.cfg.Startup.RequiredNodeLabels[node]
|
||||
if len(labels) == 0 {
|
||||
continue
|
||||
@ -55,6 +60,11 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
|
||||
continue
|
||||
}
|
||||
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
|
||||
if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
|
||||
o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
|
||||
o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
|
||||
continue
|
||||
}
|
||||
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
|
||||
}
|
||||
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))
|
||||
|
||||
@ -37,14 +37,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
return invErr
|
||||
}
|
||||
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
|
||||
if err := o.waitForAPI(ctx, 1, time.Second); err == nil {
|
||||
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed while kubernetes api is already available")
|
||||
if err := o.ensureVaultUnsealed(ctx); err != nil {
|
||||
o.noteStartupCheck("vault-unseal", false, err.Error())
|
||||
return err
|
||||
}
|
||||
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
|
||||
}
|
||||
o.maybeRunEarlyVaultUnseal(ctx)
|
||||
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
|
||||
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
|
||||
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
|
||||
@ -187,12 +180,9 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
}
|
||||
}
|
||||
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
|
||||
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
|
||||
if err := o.ensureVaultUnsealed(ctx); err != nil {
|
||||
o.noteStartupCheck("vault-unseal", false, err.Error())
|
||||
if err := o.runStartupVaultUnsealGate(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
|
||||
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -490,18 +480,3 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
|
||||
o.log.Printf("shutdown flow complete")
|
||||
return nil
|
||||
}
|
||||
|
||||
// normalizeShutdownMode runs one orchestration or CLI step.
|
||||
// Signature: normalizeShutdownMode(raw string) (string, error).
|
||||
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
|
||||
// semantics while preserving compatibility with legacy "config" callers.
|
||||
func normalizeShutdownMode(raw string) (string, error) {
|
||||
switch strings.TrimSpace(raw) {
|
||||
case "", "config", "cluster-only":
|
||||
return "cluster-only", nil
|
||||
case "poweroff":
|
||||
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
|
||||
default:
|
||||
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
|
||||
}
|
||||
}
|
||||
|
||||
@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err
|
||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
|
||||
seen := map[string]struct{}{}
|
||||
for _, node := range o.inventoryNodesForValidation() {
|
||||
for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) {
|
||||
node = strings.TrimSpace(node)
|
||||
if node == "" {
|
||||
continue
|
||||
|
||||
21
internal/cluster/orchestrator_shutdown_mode.go
Normal file
21
internal/cluster/orchestrator_shutdown_mode.go
Normal file
@ -0,0 +1,21 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// normalizeShutdownMode runs one orchestration or CLI step.
|
||||
// Signature: normalizeShutdownMode(raw string) (string, error).
|
||||
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
|
||||
// semantics while preserving compatibility with legacy "config" callers.
|
||||
func normalizeShutdownMode(raw string) (string, error) {
|
||||
switch strings.TrimSpace(raw) {
|
||||
case "", "config", "cluster-only":
|
||||
return "cluster-only", nil
|
||||
case "poweroff":
|
||||
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
|
||||
default:
|
||||
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
|
||||
}
|
||||
}
|
||||
81
internal/cluster/orchestrator_startup_scope.go
Normal file
81
internal/cluster/orchestrator_startup_scope.go
Normal file
@ -0,0 +1,81 @@
|
||||
package cluster
|
||||
|
||||
import "strings"
|
||||
|
||||
// startupRequiredNodes runs one orchestration or CLI step.
|
||||
// Signature: startupRequiredNodes(nodes []string, required []string) []string.
|
||||
// Why: lets startup enforce a smaller core node set during outage recovery
|
||||
// without losing the stricter all-nodes behavior when no override is configured.
|
||||
func startupRequiredNodes(nodes []string, required []string) []string {
|
||||
requiredSet := makeStringSet(required)
|
||||
if len(requiredSet) == 0 {
|
||||
return nodes
|
||||
}
|
||||
filtered := make([]string, 0, len(nodes))
|
||||
for _, node := range nodes {
|
||||
node = strings.TrimSpace(node)
|
||||
if node == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := requiredSet[node]; ok {
|
||||
filtered = append(filtered, node)
|
||||
}
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
// startupNodeStrictlyRequired runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool.
|
||||
// Why: absent or broken non-core nodes should not block recovery-only actions
|
||||
// like label reconciliation once the operator has narrowed startup to core nodes.
|
||||
func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool {
|
||||
node = strings.TrimSpace(node)
|
||||
if node == "" {
|
||||
return false
|
||||
}
|
||||
if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 {
|
||||
return true
|
||||
}
|
||||
for _, controlPlane := range o.cfg.ControlPlanes {
|
||||
if strings.TrimSpace(controlPlane) == node {
|
||||
return true
|
||||
}
|
||||
}
|
||||
if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) {
|
||||
return true
|
||||
}
|
||||
return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node)
|
||||
}
|
||||
|
||||
// startupRequiredFluxKustomizations runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}.
|
||||
// Why: lets outage recovery wait on a declared core GitOps slice while leaving
|
||||
// optional stacks free to converge after bootstrap succeeds.
|
||||
func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} {
|
||||
return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations)
|
||||
}
|
||||
|
||||
// startupRequiredWorkloadNamespaces runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}.
|
||||
// Why: keeps workload readiness scoped to core namespaces during recovery while
|
||||
// preserving broad convergence checks when no explicit core list is configured.
|
||||
func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} {
|
||||
return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces)
|
||||
}
|
||||
|
||||
// containsNode runs one orchestration or CLI step.
|
||||
// Signature: containsNode(entries []string, needle string) bool.
|
||||
// Why: keeps node-scope checks small and explicit anywhere startup narrows its
|
||||
// recovery gates to a declared core set.
|
||||
func containsNode(entries []string, needle string) bool {
|
||||
needle = strings.TrimSpace(needle)
|
||||
if needle == "" {
|
||||
return false
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if strings.TrimSpace(entry) == needle {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
52
internal/cluster/orchestrator_startup_vault.go
Normal file
52
internal/cluster/orchestrator_startup_vault.go
Normal file
@ -0,0 +1,52 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// maybeRunEarlyVaultUnseal runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context).
|
||||
// Why: gives startup a best-effort Vault recovery path when the API is already
|
||||
// live, without consuming the hard startup failure path before workloads recover.
|
||||
func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) {
|
||||
if err := o.waitForAPI(ctx, 1, time.Second); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available")
|
||||
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
|
||||
if err != nil {
|
||||
o.log.Printf("warning: early vault unseal deferred: %v", err)
|
||||
o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err))
|
||||
return
|
||||
}
|
||||
if deferred {
|
||||
o.log.Printf("vault early unseal deferred: %s", detail)
|
||||
o.noteStartupAutoHeal(detail)
|
||||
return
|
||||
}
|
||||
o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed")
|
||||
}
|
||||
|
||||
// runStartupVaultUnsealGate runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error.
|
||||
// Why: keeps the top-level startup flow readable while allowing Vault unseal to
|
||||
// defer cleanly until critical workload recovery when the pod is not runnable yet.
|
||||
func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error {
|
||||
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
|
||||
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
|
||||
if err != nil {
|
||||
o.noteStartupCheck("vault-unseal", false, err.Error())
|
||||
return err
|
||||
}
|
||||
if deferred {
|
||||
o.log.Printf("vault unseal deferred until workload recovery: %s", detail)
|
||||
o.noteStartupAutoHeal(detail)
|
||||
o.noteStartupCheck("vault-unseal", true, detail)
|
||||
return nil
|
||||
}
|
||||
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
|
||||
return nil
|
||||
}
|
||||
@ -71,6 +71,7 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
|
||||
if err := json.Unmarshal([]byte(out), &list); err != nil {
|
||||
return false, "", fmt.Errorf("decode controllers: %w", err)
|
||||
}
|
||||
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
|
||||
@ -84,6 +85,11 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
|
||||
if kind == "" || ns == "" || name == "" {
|
||||
continue
|
||||
}
|
||||
if len(requiredNamespaces) > 0 {
|
||||
if _, ok := requiredNamespaces[ns]; !ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if _, ok := ignoredNamespaces[ns]; ok {
|
||||
continue
|
||||
}
|
||||
|
||||
@ -116,6 +116,7 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
||||
return nil, fmt.Errorf("decode pods: %w", err)
|
||||
}
|
||||
|
||||
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
stuckReasons := map[string]struct{}{
|
||||
@ -138,6 +139,11 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
||||
if ns == "" || name == "" {
|
||||
continue
|
||||
}
|
||||
if len(requiredNamespaces) > 0 {
|
||||
if _, ok := requiredNamespaces[ns]; !ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if _, ok := ignoredNamespaces[ns]; ok {
|
||||
continue
|
||||
}
|
||||
|
||||
55
internal/cluster/testing_hooks_startup.go
Normal file
55
internal/cluster/testing_hooks_startup.go
Normal file
@ -0,0 +1,55 @@
|
||||
package cluster
|
||||
|
||||
import "context"
|
||||
|
||||
// TestHookStartupRequiredNodes runs one orchestration or CLI step.
|
||||
// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string.
|
||||
// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing.
|
||||
func TestHookStartupRequiredNodes(nodes []string, required []string) []string {
|
||||
return startupRequiredNodes(nodes, required)
|
||||
}
|
||||
|
||||
// TestHookContainsNode runs one orchestration or CLI step.
|
||||
// Signature: TestHookContainsNode(entries []string, needle string) bool.
|
||||
// Why: exposes the small startup-scope membership helper to top-level tests.
|
||||
func TestHookContainsNode(entries []string, needle string) bool {
|
||||
return containsNode(entries, needle)
|
||||
}
|
||||
|
||||
// TestHookStartupNodeStrictlyRequired runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool.
|
||||
// Why: exposes strict-node startup scoping so outage-recovery tests can confirm
|
||||
// non-core nodes stop blocking bootstrap.
|
||||
func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool {
|
||||
return o.startupNodeStrictlyRequired(node)
|
||||
}
|
||||
|
||||
// TestHookStartupRequiredFluxKustomizations runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}.
|
||||
// Why: exposes flux startup scoping so top-level tests can confirm only core
|
||||
// kustomizations block emergency bootstrap.
|
||||
func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} {
|
||||
return o.startupRequiredFluxKustomizations()
|
||||
}
|
||||
|
||||
// TestHookStartupRequiredWorkloadNamespaces runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}.
|
||||
// Why: exposes workload namespace startup scoping so top-level tests can
|
||||
// confirm only core workloads block emergency bootstrap.
|
||||
func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} {
|
||||
return o.startupRequiredWorkloadNamespaces()
|
||||
}
|
||||
|
||||
// TestHookMaybeRunEarlyVaultUnseal runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context).
|
||||
// Why: exposes the early startup Vault deferral helper to top-level tests.
|
||||
func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) {
|
||||
o.maybeRunEarlyVaultUnseal(ctx)
|
||||
}
|
||||
|
||||
// TestHookRunStartupVaultUnsealGate runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error.
|
||||
// Why: exposes the startup Vault gate helper to top-level tests.
|
||||
func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error {
|
||||
return o.runStartupVaultUnsealGate(ctx)
|
||||
}
|
||||
@ -33,6 +33,9 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
||||
c.Startup.NodeInventoryReachPollSeconds = 5
|
||||
}
|
||||
if c.Startup.NodeInventoryReachRequiredNodes == nil {
|
||||
c.Startup.NodeInventoryReachRequiredNodes = []string{}
|
||||
}
|
||||
if c.Startup.RequiredNodeLabels == nil {
|
||||
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||
"titan-09": {
|
||||
@ -121,7 +124,11 @@ func (c *Config) applyDefaults() {
|
||||
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
|
||||
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
|
||||
}
|
||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
|
||||
if c.Startup.ServiceChecklistExplicitOnly {
|
||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{})
|
||||
} else {
|
||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
|
||||
}
|
||||
for i := range c.Startup.ServiceChecklist {
|
||||
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
||||
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
||||
@ -152,12 +159,18 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
||||
c.Startup.NodeSSHAuthPollSeconds = 5
|
||||
}
|
||||
if c.Startup.NodeSSHAuthRequiredNodes == nil {
|
||||
c.Startup.NodeSSHAuthRequiredNodes = []string{}
|
||||
}
|
||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||
c.Startup.FluxHealthWaitSeconds = 900
|
||||
}
|
||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||
c.Startup.FluxHealthPollSeconds = 5
|
||||
}
|
||||
if c.Startup.FluxHealthRequiredKustomizations == nil {
|
||||
c.Startup.FluxHealthRequiredKustomizations = []string{}
|
||||
}
|
||||
if c.Startup.IgnoreFluxKustomizations == nil {
|
||||
c.Startup.IgnoreFluxKustomizations = []string{}
|
||||
}
|
||||
@ -167,6 +180,9 @@ func (c *Config) applyDefaults() {
|
||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||
c.Startup.WorkloadConvergencePollSeconds = 5
|
||||
}
|
||||
if c.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
||||
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{}
|
||||
}
|
||||
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
||||
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
||||
}
|
||||
|
||||
@ -39,24 +39,25 @@ func defaults() Config {
|
||||
"maintenance",
|
||||
},
|
||||
Startup: Startup{
|
||||
APIWaitSeconds: 1200,
|
||||
APIPollSeconds: 2,
|
||||
ShutdownCooldownSeconds: 45,
|
||||
RequireNodeInventoryReach: true,
|
||||
NodeInventoryReachWaitSeconds: 300,
|
||||
NodeInventoryReachPollSeconds: 5,
|
||||
RequireTimeSync: true,
|
||||
TimeSyncWaitSeconds: 240,
|
||||
TimeSyncPollSeconds: 5,
|
||||
TimeSyncMode: "quorum",
|
||||
TimeSyncQuorum: 2,
|
||||
ReconcileAccessOnBoot: true,
|
||||
AutoEtcdRestoreOnAPIFailure: true,
|
||||
EtcdRestoreControlPlane: "titan-0a",
|
||||
RequireStorageReady: true,
|
||||
StorageReadyWaitSeconds: 420,
|
||||
StorageReadyPollSeconds: 5,
|
||||
StorageMinReadyNodes: 2,
|
||||
APIWaitSeconds: 1200,
|
||||
APIPollSeconds: 2,
|
||||
ShutdownCooldownSeconds: 45,
|
||||
RequireNodeInventoryReach: true,
|
||||
NodeInventoryReachWaitSeconds: 300,
|
||||
NodeInventoryReachPollSeconds: 5,
|
||||
NodeInventoryReachRequiredNodes: []string{},
|
||||
RequireTimeSync: true,
|
||||
TimeSyncWaitSeconds: 240,
|
||||
TimeSyncPollSeconds: 5,
|
||||
TimeSyncMode: "quorum",
|
||||
TimeSyncQuorum: 2,
|
||||
ReconcileAccessOnBoot: true,
|
||||
AutoEtcdRestoreOnAPIFailure: true,
|
||||
EtcdRestoreControlPlane: "titan-0a",
|
||||
RequireStorageReady: true,
|
||||
StorageReadyWaitSeconds: 420,
|
||||
StorageReadyPollSeconds: 5,
|
||||
StorageMinReadyNodes: 2,
|
||||
StorageCriticalPVCs: []string{
|
||||
"vault/data-vault-0",
|
||||
"postgres/postgres-data-postgres-0",
|
||||
@ -91,33 +92,36 @@ func defaults() Config {
|
||||
AdminSecretUsernameKey: "username",
|
||||
AdminSecretPasswordKey: "password",
|
||||
},
|
||||
ServiceChecklist: defaultServiceChecklist(),
|
||||
RequireCriticalServiceEndpoints: true,
|
||||
CriticalServiceEndpointWaitSec: 420,
|
||||
CriticalServiceEndpointPollSec: 5,
|
||||
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
|
||||
RequireIngressChecklist: true,
|
||||
IngressChecklistWaitSeconds: 420,
|
||||
IngressChecklistPollSeconds: 5,
|
||||
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
|
||||
IngressChecklistIgnoreHosts: []string{},
|
||||
RequireNodeSSHAuth: true,
|
||||
NodeSSHAuthWaitSeconds: 240,
|
||||
NodeSSHAuthPollSeconds: 5,
|
||||
RequireFluxHealth: true,
|
||||
FluxHealthWaitSeconds: 900,
|
||||
FluxHealthPollSeconds: 5,
|
||||
IgnoreFluxKustomizations: []string{},
|
||||
RequireWorkloadConvergence: true,
|
||||
WorkloadConvergenceWaitSeconds: 900,
|
||||
WorkloadConvergencePollSeconds: 5,
|
||||
IgnoreWorkloadNamespaces: []string{},
|
||||
IgnoreWorkloads: []string{},
|
||||
IgnoreUnavailableNodes: []string{},
|
||||
AutoRecycleStuckPods: true,
|
||||
StuckPodGraceSeconds: 180,
|
||||
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
||||
VaultUnsealBreakglassTimeout: 15,
|
||||
ServiceChecklist: defaultServiceChecklist(),
|
||||
RequireCriticalServiceEndpoints: true,
|
||||
CriticalServiceEndpointWaitSec: 420,
|
||||
CriticalServiceEndpointPollSec: 5,
|
||||
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
|
||||
RequireIngressChecklist: true,
|
||||
IngressChecklistWaitSeconds: 420,
|
||||
IngressChecklistPollSeconds: 5,
|
||||
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
|
||||
IngressChecklistIgnoreHosts: []string{},
|
||||
RequireNodeSSHAuth: true,
|
||||
NodeSSHAuthWaitSeconds: 240,
|
||||
NodeSSHAuthPollSeconds: 5,
|
||||
NodeSSHAuthRequiredNodes: []string{},
|
||||
RequireFluxHealth: true,
|
||||
FluxHealthWaitSeconds: 900,
|
||||
FluxHealthPollSeconds: 5,
|
||||
FluxHealthRequiredKustomizations: []string{},
|
||||
IgnoreFluxKustomizations: []string{},
|
||||
RequireWorkloadConvergence: true,
|
||||
WorkloadConvergenceWaitSeconds: 900,
|
||||
WorkloadConvergencePollSeconds: 5,
|
||||
WorkloadConvergenceRequiredNamespaces: []string{},
|
||||
IgnoreWorkloadNamespaces: []string{},
|
||||
IgnoreWorkloads: []string{},
|
||||
IgnoreUnavailableNodes: []string{},
|
||||
AutoRecycleStuckPods: true,
|
||||
StuckPodGraceSeconds: 180,
|
||||
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
||||
VaultUnsealBreakglassTimeout: 15,
|
||||
},
|
||||
Shutdown: Shutdown{
|
||||
DefaultBudgetSeconds: 1380,
|
||||
|
||||
@ -51,3 +51,41 @@ startup:
|
||||
t.Fatalf("expected validation failure")
|
||||
}
|
||||
}
|
||||
|
||||
// TestLoadKeepsExplicitServiceChecklist runs one orchestration or CLI step.
|
||||
// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T).
|
||||
// Why: host recovery configs must be able to keep a narrow, explicit checklist
|
||||
// without silently inheriting the full default service catalog.
|
||||
func TestLoadKeepsExplicitServiceChecklist(t *testing.T) {
|
||||
cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
|
||||
raw := `
|
||||
control_planes: [titan-0a]
|
||||
expected_flux_branch: main
|
||||
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||
iac_repo_path: /opt/titan-iac
|
||||
startup:
|
||||
service_checklist_explicit_only: true
|
||||
service_checklist:
|
||||
- name: gitea-api
|
||||
url: https://scm.bstein.dev/api/healthz
|
||||
accepted_statuses: [200]
|
||||
body_contains: pass
|
||||
timeout_seconds: 12
|
||||
ups:
|
||||
enabled: false
|
||||
`
|
||||
if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil {
|
||||
t.Fatalf("write config: %v", err)
|
||||
}
|
||||
|
||||
cfg, err := Load(cfgPath)
|
||||
if err != nil {
|
||||
t.Fatalf("load config: %v", err)
|
||||
}
|
||||
if len(cfg.Startup.ServiceChecklist) != 1 {
|
||||
t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist))
|
||||
}
|
||||
if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" {
|
||||
t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name)
|
||||
}
|
||||
}
|
||||
|
||||
@ -27,65 +27,70 @@ type Config struct {
|
||||
}
|
||||
|
||||
type Startup struct {
|
||||
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
||||
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
||||
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
|
||||
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
||||
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
||||
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
||||
TimeSyncMode string `yaml:"time_sync_mode"`
|
||||
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
||||
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
||||
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
||||
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
||||
RequireStorageReady bool `yaml:"require_storage_ready"`
|
||||
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
||||
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
||||
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
||||
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
||||
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
||||
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
||||
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
||||
PostStartProbes []string `yaml:"post_start_probes"`
|
||||
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
||||
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
||||
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
||||
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
||||
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
|
||||
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
||||
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
|
||||
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
|
||||
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
|
||||
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
|
||||
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
|
||||
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
|
||||
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
|
||||
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
|
||||
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
|
||||
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
|
||||
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
||||
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
||||
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
||||
RequireFluxHealth bool `yaml:"require_flux_health"`
|
||||
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
||||
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
||||
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
||||
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
||||
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
||||
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
||||
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
||||
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
||||
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
||||
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
||||
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
||||
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
||||
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
|
||||
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
||||
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
||||
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
|
||||
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
||||
TimeSyncMode string `yaml:"time_sync_mode"`
|
||||
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
||||
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
||||
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
||||
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
||||
RequireStorageReady bool `yaml:"require_storage_ready"`
|
||||
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
||||
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
||||
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
||||
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
||||
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
||||
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
||||
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
||||
PostStartProbes []string `yaml:"post_start_probes"`
|
||||
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
||||
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
||||
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
||||
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
||||
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
|
||||
ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"`
|
||||
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
||||
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
|
||||
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
|
||||
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
|
||||
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
|
||||
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
|
||||
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
|
||||
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
|
||||
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
|
||||
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
|
||||
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
|
||||
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
||||
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
||||
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
||||
NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"`
|
||||
RequireFluxHealth bool `yaml:"require_flux_health"`
|
||||
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
||||
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
||||
FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"`
|
||||
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
||||
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
||||
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
||||
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
||||
WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"`
|
||||
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
||||
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
||||
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
||||
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
||||
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||
}
|
||||
|
||||
type ServiceChecklistCheck struct {
|
||||
|
||||
@ -61,6 +61,11 @@ func (c Config) Validate() error {
|
||||
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
|
||||
}
|
||||
for _, node := range c.Startup.NodeInventoryReachRequiredNodes {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
|
||||
}
|
||||
}
|
||||
for node, labels := range c.Startup.RequiredNodeLabels {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
||||
@ -233,18 +238,37 @@ func (c Config) Validate() error {
|
||||
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
|
||||
}
|
||||
for _, node := range c.Startup.NodeSSHAuthRequiredNodes {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty")
|
||||
}
|
||||
}
|
||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
|
||||
}
|
||||
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
|
||||
item = strings.TrimSpace(item)
|
||||
if item == "" {
|
||||
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty")
|
||||
}
|
||||
if strings.Count(item, "/") != 1 {
|
||||
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item)
|
||||
}
|
||||
}
|
||||
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
|
||||
}
|
||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
|
||||
}
|
||||
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
|
||||
if strings.TrimSpace(ns) == "" {
|
||||
return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty")
|
||||
}
|
||||
}
|
||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
||||
}
|
||||
@ -277,6 +301,16 @@ func (c Config) Validate() error {
|
||||
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
|
||||
}
|
||||
}
|
||||
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
|
||||
if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) {
|
||||
return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item))
|
||||
}
|
||||
}
|
||||
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
|
||||
if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) {
|
||||
return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns))
|
||||
}
|
||||
}
|
||||
for _, node := range c.Startup.IgnoreUnavailableNodes {
|
||||
if strings.TrimSpace(node) == "" {
|
||||
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
|
||||
@ -328,3 +362,20 @@ func (c Config) Validate() error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// containsTrimmed runs one orchestration or CLI step.
|
||||
// Signature: containsTrimmed(entries []string, needle string) bool.
|
||||
// Why: startup config now supports both required and ignored recovery scopes, so
|
||||
// validation needs a single normalized overlap check for those lists.
|
||||
func containsTrimmed(entries []string, needle string) bool {
|
||||
needle = strings.TrimSpace(needle)
|
||||
if needle == "" {
|
||||
return false
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if strings.TrimSpace(entry) == needle {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@ -30,6 +30,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
||||
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
|
||||
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
|
||||
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
|
||||
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
|
||||
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
|
||||
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
|
||||
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
|
||||
@ -68,15 +69,27 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
||||
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
|
||||
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
|
||||
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
|
||||
{"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }},
|
||||
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
|
||||
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
|
||||
{"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }},
|
||||
{"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }},
|
||||
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
|
||||
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
|
||||
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
|
||||
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
|
||||
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
|
||||
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
|
||||
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
|
||||
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
|
||||
{"bad_overlap_flux_required_and_ignored", func(c *Config) {
|
||||
c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"}
|
||||
c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"}
|
||||
}},
|
||||
{"bad_overlap_workload_required_and_ignored", func(c *Config) {
|
||||
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
|
||||
c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"}
|
||||
}},
|
||||
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
|
||||
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
|
||||
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
|
||||
@ -121,6 +134,10 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
|
||||
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
|
||||
t.Fatalf("expected startup defaults to be set")
|
||||
}
|
||||
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
|
||||
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
||||
t.Fatalf("expected startup recovery scope slices to be initialized")
|
||||
}
|
||||
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
|
||||
t.Fatalf("expected critical service endpoint timing defaults to be set")
|
||||
}
|
||||
|
||||
@ -79,6 +79,29 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) {
|
||||
cfg := lifecycleConfig(t)
|
||||
cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"}
|
||||
|
||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||
command := name + " " + strings.Join(args, " ")
|
||||
switch {
|
||||
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
|
||||
return `{"items":[
|
||||
{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}},
|
||||
{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}}
|
||||
]}`, nil
|
||||
default:
|
||||
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||
}
|
||||
}
|
||||
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||
ready, detail, err := orch.TestHookFluxHealthReady(context.Background())
|
||||
if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") {
|
||||
t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
|
||||
}
|
||||
})

	t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.WorkloadConvergenceWaitSeconds = 1

@ -145,6 +168,42 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
		}
	})

	t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
		cfg.Startup.StuckPodGraceSeconds = 1

		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[
{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}},
{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}
]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background())
		if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
			t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
		}
		failures, err := orch.TestHookStartupFailurePods(context.Background())
		if err != nil {
			t.Fatalf("startup failure pod query: %v", err)
		}
		if len(failures) != 0 {
			t.Fatalf("expected optional namespace failures to be ignored, got %v", failures)
		}
	})
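
A small hedged sketch (assumed helper and types, not the orchestrator's real code) of the scoping behaviour covered above: with a required-namespace scope set, crash-looping pods outside that scope are dropped from the startup failure report instead of blocking convergence.

package sketch

// startupFailurePods is illustrative: failing maps pod name to namespace;
// requiredNamespaces is the configured workload-convergence scope.
func startupFailurePods(failing map[string]string, requiredNamespaces []string) []string {
	required := map[string]bool{}
	for _, ns := range requiredNamespaces {
		required[ns] = true
	}
	var out []string
	for pod, namespace := range failing {
		if len(required) == 0 || required[namespace] {
			out = append(out, pod) // only scoped namespaces surface failures
		}
	}
	return out
}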

	t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {

@ -53,6 +53,48 @@ func TestHookIngressServiceMatrix(t *testing.T) {
		}
	})

	t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
			"titan-09": {
				"ananke.bstein.dev/harbor-bootstrap": "true",
			},
		}
		cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"}
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
				t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command)
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
			t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err)
		}
	})

	t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
			"titan-09": {
				"ananke.bstein.dev/harbor-bootstrap": "true",
			},
		}
		cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
				return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
			t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
		}
	})
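
A hedged sketch (hypothetical names, not the repo's actual method) of the branch these two subtests cover: label enforcement is skipped for nodes explicitly listed as ignorable, and a kubectl NotFound error is tolerated when the node is not strictly required by any recovery scope.

package sketch

import "strings"

// ensureNodeLabel is illustrative: strict nodes still fail hard, while
// ignored or absent non-core nodes merely skip label enforcement.
func ensureNodeLabel(node string, ignored []string, strictlyRequired func(string) bool,
	label func(node string) error) error {
	for _, skip := range ignored {
		if skip == node {
			return nil // operator declared this node may stay offline
		}
	}
	if err := label(node); err != nil {
		if strings.Contains(err.Error(), "NotFound") && !strictlyRequired(node) {
			return nil // absent non-core node: defer labeling until it returns
		}
		return err
	}
	return nil
}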

	t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
		tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			w.WriteHeader(http.StatusOK)

@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
		switch {
		case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
			apiVersionCalls++
			if apiVersionCalls == 1 {
			if apiVersionCalls <= 2 {
				return "", errors.New("api down")
			}
			return "v1.31.0", nil

222
testing/orchestrator/hooks_startup_scope_vault_test.go
Normal file
@ -0,0 +1,222 @@
package orchestrator

import (
	"context"
	"errors"
	"os"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
)

// readStartupProgress reads the orchestrator's startup progress artifact.
// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
// Why: startup helper tests need to inspect progress artifacts without reaching
// into internal package state from the top-level testing module.
func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
	t.Helper()
	payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
	if err != nil {
		t.Fatalf("read startup progress: %v", err)
	}
	return string(payload)
}

// TestHookStartupScopeAndVaultHelpers exercises the startup-scope and startup-Vault helpers.
// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
// Why: keeps startup-scope and startup-Vault helper branches covered from the
// split top-level testing module required by the repo hygiene contract.
func TestHookStartupScopeAndVaultHelpers(t *testing.T) {
	t.Run("startup-scope-helpers", func(t *testing.T) {
		nodes := []string{"titan-db", " titan-23 ", "", "titan-24"}
		if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) {
			t.Fatalf("expected passthrough node list, got %v", got)
		}
		got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"})
		if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" {
			t.Fatalf("unexpected filtered node list: %v", got)
		}

		if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") {
			t.Fatalf("expected trimmed node membership match")
		}
		if cluster.TestHookContainsNode([]string{"titan-db"}, " ") {
			t.Fatalf("expected blank node probe to be ignored")
		}

		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if !orch.TestHookStartupNodeStrictlyRequired("titan-23") {
			t.Fatalf("expected all nodes to be strict when no recovery scopes are configured")
		}

		cfgScoped := lifecycleConfig(t)
		cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"}
		cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"}
		cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "}
		cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "}
		orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil)
		if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") {
			t.Fatalf("expected control plane to remain strict")
		}
		if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") {
			t.Fatalf("expected inventory-scoped node to remain strict")
		}
		if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") {
			t.Fatalf("expected ssh-scoped node to remain strict")
		}
		if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") {
			t.Fatalf("expected non-core worker to stop being strict")
		}

		flux := orchScoped.TestHookStartupRequiredFluxKustomizations()
		if _, ok := flux["flux-system/core"]; !ok {
			t.Fatalf("expected core flux kustomization in required set: %v", flux)
		}
		if _, ok := flux["flux-system/gitea"]; !ok {
			t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux)
		}

		namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces()
		if _, ok := namespaces["vault"]; !ok {
			t.Fatalf("expected vault namespace in required set: %v", namespaces)
		}
		if _, ok := namespaces["monitoring"]; !ok {
			t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces)
		}
	})
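
A brief illustrative sketch (assumed names, not the `cluster` package's real implementation) of the contract the assertions above pin down: an empty scope passes the full inventory through, a non-empty scope filters to the listed nodes, and blank or padded entries are trimmed before comparison.

package sketch

import "strings"

// startupRequiredNodes mirrors the tested behaviour: a nil/empty scope keeps
// every inventory node; otherwise only scoped nodes (in scope order) remain.
func startupRequiredNodes(inventory, scope []string) []string {
	if len(scope) == 0 {
		return inventory
	}
	inInventory := map[string]bool{}
	for _, node := range inventory {
		inInventory[strings.TrimSpace(node)] = true
	}
	var out []string
	for _, node := range scope {
		if trimmed := strings.TrimSpace(node); trimmed != "" && inInventory[trimmed] {
			out = append(out, node)
		}
	}
	return out
}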

	t.Run("startup-vault-helpers", func(t *testing.T) {
		t.Run("early-vault-unseal-paths", func(t *testing.T) {
			cfgAPI := lifecycleConfig(t)
			runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
					return "", errors.New("api down")
				}
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
			orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI)
			orchAPI.TestHookBeginStartupReport("startup-vault")
			orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background())
			if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") {
				t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload)
			}

			cfgErr := lifecycleConfig(t)
			runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				switch {
				case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
					return "v1.31.0", nil
				case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
					return "", errors.New("phase probe failed")
				default:
					return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
				}
			}
			orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
			orchErr.TestHookBeginStartupReport("startup-vault")
			orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background())
			if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") {
				t.Fatalf("expected early vault auto-heal detail, payload=%s", payload)
			}

			cfgDeferred := lifecycleConfig(t)
			runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				switch {
				case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
					return "v1.31.0", nil
				case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
					return "Pending", nil
				default:
					return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
				}
			}
			orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
			orchDeferred.TestHookBeginStartupReport("startup-vault")
			orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background())
			if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
				t.Fatalf("expected deferred early vault detail, payload=%s", payload)
			}

			cfgSuccess := lifecycleConfig(t)
			runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				switch {
				case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
					return "v1.31.0", nil
				case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
					return "Running", nil
				case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
					return `{"sealed":false,"initialized":true}`, nil
				default:
					return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
				}
			}
			orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
			orchSuccess.TestHookBeginStartupReport("startup-vault")
			orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background())
			if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) {
				t.Fatalf("expected successful early vault check, payload=%s", payload)
			}
		})

		t.Run("startup-vault-gate-paths", func(t *testing.T) {
			cfgErr := lifecycleConfig(t)
			runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
					return "", errors.New("phase probe failed")
				}
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
			orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
			orchErr.TestHookBeginStartupReport("startup-vault")
			if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") {
				t.Fatalf("expected startup vault gate error, got %v", err)
			}

			cfgDeferred := lifecycleConfig(t)
			runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
					return "Pending", nil
				}
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
			orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
			orchDeferred.TestHookBeginStartupReport("startup-vault")
			if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
				t.Fatalf("expected deferred startup vault gate to succeed, got %v", err)
			}
			if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
				t.Fatalf("expected deferred startup vault detail, payload=%s", payload)
			}

			cfgSuccess := lifecycleConfig(t)
			runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				switch {
				case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
					return "Running", nil
				case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
					return `{"sealed":false,"initialized":true}`, nil
				default:
					return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
				}
			}
			orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
			orchSuccess.TestHookBeginStartupReport("startup-vault")
			if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
				t.Fatalf("expected successful startup vault gate, got %v", err)
			}
			if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) {
				t.Fatalf("expected successful startup vault detail, payload=%s", payload)
			}
		})
	})
}
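
A hedged sketch (illustrative helper, not the orchestrator's real method) of the decision order the Vault subtests walk through: skip silently while the API is down, defer while vault-0 is not Running or the phase probe errors, and only then consult `vault status -format=json` for the sealed flag.

package sketch

import "encoding/json"

// earlyVaultUnsealStep is illustrative; the probe funcs stand in for the
// kubectl calls the tests stub out.
func earlyVaultUnsealStep(apiUp func() bool, podPhase func() (string, error),
	vaultStatus func() (string, error)) (detail string, sealed bool) {
	if !apiUp() {
		return "", false // API unreachable: no early vault check is recorded
	}
	phase, err := podPhase()
	if err != nil || phase != "Running" {
		return "deferred early vault unseal", false // retry later during startup
	}
	raw, err := vaultStatus()
	if err != nil {
		return "deferred early vault unseal", false
	}
	var status struct {
		Sealed bool `json:"sealed"`
	}
	if json.Unmarshal([]byte(raw), &status) == nil && !status.Sealed {
		return "vault is unsealed", false
	}
	return "vault is sealed", true
}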

@ -165,6 +165,32 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
				t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
			}
		})

		t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) {
			cfg := lifecycleConfig(t)
			cfg.Startup.RequireNodeSSHAuth = true
			cfg.Startup.NodeSSHAuthWaitSeconds = 1
			cfg.Startup.NodeSSHAuthPollSeconds = 1
			cfg.Startup.NodeInventoryReachWaitSeconds = 1
			cfg.Startup.NodeInventoryReachPollSeconds = 1
			cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
			cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"}

			run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
				command := name + " " + strings.Join(args, " ")
				if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") {
					return "", errors.New("no route to host")
				}
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
			orch, _ := newHookOrchestrator(t, cfg, run, run)
			if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
				t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err)
			}
			if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil {
				t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err)
			}
		})
	})

	t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {
