startup(ananke): scope emergency recovery to core services
This commit is contained in:
parent
a3e24b9b15
commit
1f656de5df
14
README.md
14
README.md
@ -97,10 +97,15 @@ Primary config path:
|
|||||||
Keep these fields accurate:
|
Keep these fields accurate:
|
||||||
- `expected_flux_source_url`
|
- `expected_flux_source_url`
|
||||||
- `expected_flux_branch`
|
- `expected_flux_branch`
|
||||||
|
- `startup.service_checklist_explicit_only`
|
||||||
- `startup.service_checklist`
|
- `startup.service_checklist`
|
||||||
- `startup.critical_service_endpoints`
|
- `startup.critical_service_endpoints`
|
||||||
- `startup.require_ingress_checklist`
|
- `startup.require_ingress_checklist`
|
||||||
- `startup.require_node_inventory_reachability`
|
- `startup.require_node_inventory_reachability`
|
||||||
|
- `startup.node_inventory_reachability_required_nodes`
|
||||||
|
- `startup.node_ssh_auth_required_nodes`
|
||||||
|
- `startup.flux_health_required_kustomizations`
|
||||||
|
- `startup.workload_convergence_required_namespaces`
|
||||||
- `startup.ignore_unavailable_nodes`
|
- `startup.ignore_unavailable_nodes`
|
||||||
- `coordination.role`
|
- `coordination.role`
|
||||||
- `coordination.peer_hosts`
|
- `coordination.peer_hosts`
|
||||||
@ -134,9 +139,10 @@ Installer behavior:
|
|||||||
|
|
||||||
When adding nodes or services:
|
When adding nodes or services:
|
||||||
1. Update inventory and node mapping in config.
|
1. Update inventory and node mapping in config.
|
||||||
2. Add/adjust service checklist entries for anything user-facing or critical.
|
2. Keep the explicit service checklist focused on the core services that must come back during an outage.
|
||||||
3. Add/adjust ingress expectations for exposed services.
|
3. Keep `*_required_*` startup scopes aligned with the same core set so optional stacks do not block bootstrap.
|
||||||
4. Use temporary ignores only when truly intentional, then remove them.
|
4. Add/adjust ingress expectations for exposed services.
|
||||||
5. Run `scripts/quality_gate.sh` before host deployment.
|
5. Use temporary ignores only when truly intentional, then remove them.
|
||||||
|
6. Run `scripts/quality_gate.sh` before host deployment.
|
||||||
|
|
||||||
Recovery quality should improve over time: every drill should reduce manual work in the next drill.
|
Recovery quality should improve over time: every drill should reduce manual work in the next drill.
|
||||||
|
|||||||
@ -51,6 +51,7 @@ startup:
|
|||||||
require_node_inventory_reachability: true
|
require_node_inventory_reachability: true
|
||||||
node_inventory_reachability_wait_seconds: 300
|
node_inventory_reachability_wait_seconds: 300
|
||||||
node_inventory_reachability_poll_seconds: 5
|
node_inventory_reachability_poll_seconds: 5
|
||||||
|
node_inventory_reachability_required_nodes: []
|
||||||
required_node_labels:
|
required_node_labels:
|
||||||
titan-09:
|
titan-09:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
@ -90,6 +91,7 @@ startup:
|
|||||||
admin_secret_name: keycloak-admin
|
admin_secret_name: keycloak-admin
|
||||||
admin_secret_username_key: username
|
admin_secret_username_key: username
|
||||||
admin_secret_password_key: password
|
admin_secret_password_key: password
|
||||||
|
service_checklist_explicit_only: false
|
||||||
service_checklist:
|
service_checklist:
|
||||||
- name: gitea-api
|
- name: gitea-api
|
||||||
url: https://scm.bstein.dev/api/healthz
|
url: https://scm.bstein.dev/api/healthz
|
||||||
@ -134,13 +136,16 @@ startup:
|
|||||||
require_node_ssh_auth: true
|
require_node_ssh_auth: true
|
||||||
node_ssh_auth_wait_seconds: 240
|
node_ssh_auth_wait_seconds: 240
|
||||||
node_ssh_auth_poll_seconds: 5
|
node_ssh_auth_poll_seconds: 5
|
||||||
|
node_ssh_auth_required_nodes: []
|
||||||
require_flux_health: true
|
require_flux_health: true
|
||||||
flux_health_wait_seconds: 900
|
flux_health_wait_seconds: 900
|
||||||
flux_health_poll_seconds: 5
|
flux_health_poll_seconds: 5
|
||||||
|
flux_health_required_kustomizations: []
|
||||||
ignore_flux_kustomizations: []
|
ignore_flux_kustomizations: []
|
||||||
require_workload_convergence: true
|
require_workload_convergence: true
|
||||||
workload_convergence_wait_seconds: 900
|
workload_convergence_wait_seconds: 900
|
||||||
workload_convergence_poll_seconds: 5
|
workload_convergence_poll_seconds: 5
|
||||||
|
workload_convergence_required_namespaces: []
|
||||||
ignore_workload_namespaces: []
|
ignore_workload_namespaces: []
|
||||||
ignore_workloads: []
|
ignore_workloads: []
|
||||||
ignore_unavailable_nodes: []
|
ignore_unavailable_nodes: []
|
||||||
|
|||||||
@ -117,6 +117,10 @@ startup:
|
|||||||
require_node_inventory_reachability: true
|
require_node_inventory_reachability: true
|
||||||
node_inventory_reachability_wait_seconds: 300
|
node_inventory_reachability_wait_seconds: 300
|
||||||
node_inventory_reachability_poll_seconds: 5
|
node_inventory_reachability_poll_seconds: 5
|
||||||
|
node_inventory_reachability_required_nodes:
|
||||||
|
- titan-0a
|
||||||
|
- titan-0b
|
||||||
|
- titan-0c
|
||||||
required_node_labels:
|
required_node_labels:
|
||||||
titan-09:
|
titan-09:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
@ -156,6 +160,7 @@ startup:
|
|||||||
admin_secret_name: keycloak-admin
|
admin_secret_name: keycloak-admin
|
||||||
admin_secret_username_key: username
|
admin_secret_username_key: username
|
||||||
admin_secret_password_key: password
|
admin_secret_password_key: password
|
||||||
|
service_checklist_explicit_only: true
|
||||||
service_checklist:
|
service_checklist:
|
||||||
- name: gitea-api
|
- name: gitea-api
|
||||||
url: https://scm.bstein.dev/api/healthz
|
url: https://scm.bstein.dev/api/healthz
|
||||||
@ -200,13 +205,39 @@ startup:
|
|||||||
require_node_ssh_auth: true
|
require_node_ssh_auth: true
|
||||||
node_ssh_auth_wait_seconds: 240
|
node_ssh_auth_wait_seconds: 240
|
||||||
node_ssh_auth_poll_seconds: 5
|
node_ssh_auth_poll_seconds: 5
|
||||||
|
node_ssh_auth_required_nodes:
|
||||||
|
- titan-0a
|
||||||
|
- titan-0b
|
||||||
|
- titan-0c
|
||||||
require_flux_health: true
|
require_flux_health: true
|
||||||
flux_health_wait_seconds: 900
|
flux_health_wait_seconds: 900
|
||||||
flux_health_poll_seconds: 5
|
flux_health_poll_seconds: 5
|
||||||
|
flux_health_required_kustomizations:
|
||||||
|
- flux-system/core
|
||||||
|
- flux-system/helm
|
||||||
|
- flux-system/traefik
|
||||||
|
- flux-system/cert-manager
|
||||||
|
- flux-system/longhorn
|
||||||
|
- flux-system/vault-csi
|
||||||
|
- flux-system/vault-injector
|
||||||
|
- flux-system/postgres
|
||||||
|
- flux-system/vault
|
||||||
|
- flux-system/keycloak
|
||||||
|
- flux-system/oauth2-proxy
|
||||||
|
- flux-system/gitea
|
||||||
|
- flux-system/monitoring
|
||||||
|
- flux-system/harbor
|
||||||
ignore_flux_kustomizations: []
|
ignore_flux_kustomizations: []
|
||||||
require_workload_convergence: true
|
require_workload_convergence: true
|
||||||
workload_convergence_wait_seconds: 900
|
workload_convergence_wait_seconds: 900
|
||||||
workload_convergence_poll_seconds: 5
|
workload_convergence_poll_seconds: 5
|
||||||
|
workload_convergence_required_namespaces:
|
||||||
|
- vault
|
||||||
|
- postgres
|
||||||
|
- sso
|
||||||
|
- gitea
|
||||||
|
- monitoring
|
||||||
|
- harbor
|
||||||
ignore_workload_namespaces: []
|
ignore_workload_namespaces: []
|
||||||
ignore_workloads: []
|
ignore_workloads: []
|
||||||
ignore_unavailable_nodes: []
|
ignore_unavailable_nodes: []
|
||||||
|
|||||||
@ -117,6 +117,10 @@ startup:
|
|||||||
require_node_inventory_reachability: true
|
require_node_inventory_reachability: true
|
||||||
node_inventory_reachability_wait_seconds: 300
|
node_inventory_reachability_wait_seconds: 300
|
||||||
node_inventory_reachability_poll_seconds: 5
|
node_inventory_reachability_poll_seconds: 5
|
||||||
|
node_inventory_reachability_required_nodes:
|
||||||
|
- titan-0a
|
||||||
|
- titan-0b
|
||||||
|
- titan-0c
|
||||||
required_node_labels:
|
required_node_labels:
|
||||||
titan-09:
|
titan-09:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
@ -156,6 +160,7 @@ startup:
|
|||||||
admin_secret_name: keycloak-admin
|
admin_secret_name: keycloak-admin
|
||||||
admin_secret_username_key: username
|
admin_secret_username_key: username
|
||||||
admin_secret_password_key: password
|
admin_secret_password_key: password
|
||||||
|
service_checklist_explicit_only: true
|
||||||
service_checklist:
|
service_checklist:
|
||||||
- name: gitea-api
|
- name: gitea-api
|
||||||
url: https://scm.bstein.dev/api/healthz
|
url: https://scm.bstein.dev/api/healthz
|
||||||
@ -200,13 +205,39 @@ startup:
|
|||||||
require_node_ssh_auth: true
|
require_node_ssh_auth: true
|
||||||
node_ssh_auth_wait_seconds: 240
|
node_ssh_auth_wait_seconds: 240
|
||||||
node_ssh_auth_poll_seconds: 5
|
node_ssh_auth_poll_seconds: 5
|
||||||
|
node_ssh_auth_required_nodes:
|
||||||
|
- titan-0a
|
||||||
|
- titan-0b
|
||||||
|
- titan-0c
|
||||||
require_flux_health: true
|
require_flux_health: true
|
||||||
flux_health_wait_seconds: 900
|
flux_health_wait_seconds: 900
|
||||||
flux_health_poll_seconds: 5
|
flux_health_poll_seconds: 5
|
||||||
|
flux_health_required_kustomizations:
|
||||||
|
- flux-system/core
|
||||||
|
- flux-system/helm
|
||||||
|
- flux-system/traefik
|
||||||
|
- flux-system/cert-manager
|
||||||
|
- flux-system/longhorn
|
||||||
|
- flux-system/vault-csi
|
||||||
|
- flux-system/vault-injector
|
||||||
|
- flux-system/postgres
|
||||||
|
- flux-system/vault
|
||||||
|
- flux-system/keycloak
|
||||||
|
- flux-system/oauth2-proxy
|
||||||
|
- flux-system/gitea
|
||||||
|
- flux-system/monitoring
|
||||||
|
- flux-system/harbor
|
||||||
ignore_flux_kustomizations: []
|
ignore_flux_kustomizations: []
|
||||||
require_workload_convergence: true
|
require_workload_convergence: true
|
||||||
workload_convergence_wait_seconds: 900
|
workload_convergence_wait_seconds: 900
|
||||||
workload_convergence_poll_seconds: 5
|
workload_convergence_poll_seconds: 5
|
||||||
|
workload_convergence_required_namespaces:
|
||||||
|
- vault
|
||||||
|
- postgres
|
||||||
|
- sso
|
||||||
|
- gitea
|
||||||
|
- monitoring
|
||||||
|
- harbor
|
||||||
ignore_workload_namespaces: []
|
ignore_workload_namespaces: []
|
||||||
ignore_workloads: []
|
ignore_workloads: []
|
||||||
ignore_unavailable_nodes: []
|
ignore_unavailable_nodes: []
|
||||||
|
|||||||
@ -77,7 +77,7 @@ func (o *Orchestrator) waitForNodeSSHAuth(ctx context.Context, nodes []string) e
|
|||||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
seen := map[string]struct{}{}
|
seen := map[string]struct{}{}
|
||||||
targets := make([]string, 0, len(nodes))
|
targets := make([]string, 0, len(nodes))
|
||||||
for _, node := range nodes {
|
for _, node := range startupRequiredNodes(nodes, o.cfg.Startup.NodeSSHAuthRequiredNodes) {
|
||||||
node = strings.TrimSpace(node)
|
node = strings.TrimSpace(node)
|
||||||
if node == "" {
|
if node == "" {
|
||||||
continue
|
continue
|
||||||
|
|||||||
@ -227,6 +227,31 @@ func (o *Orchestrator) waitVaultReady(ctx context.Context, w startupWorkload) er
|
|||||||
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
|
return fmt.Errorf("wait ready %s/%s/%s: timeout", w.Namespace, w.Kind, w.Name)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ensureVaultUnsealedWhenRunnable runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error).
|
||||||
|
// Why: lets startup defer vault unseal until the pod is actually runnable, while
|
||||||
|
// keeping the direct unseal helper strict for explicit recovery paths and tests.
|
||||||
|
func (o *Orchestrator) ensureVaultUnsealedWhenRunnable(ctx context.Context) (bool, string, error) {
|
||||||
|
if o.runner.DryRun {
|
||||||
|
return false, "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
|
||||||
|
if err != nil {
|
||||||
|
if isNotFoundErr(err) {
|
||||||
|
return true, "vault-0 pod is not present yet; deferring unseal until critical workload recovery", nil
|
||||||
|
}
|
||||||
|
return false, "", fmt.Errorf("vault pod phase check failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
trimmedPhase := strings.TrimSpace(phase)
|
||||||
|
if trimmedPhase != "Running" {
|
||||||
|
return true, fmt.Sprintf("vault-0 pod phase is %q; deferring unseal until critical workload recovery", trimmedPhase), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return false, "", o.ensureVaultUnsealed(ctx)
|
||||||
|
}
|
||||||
|
|
||||||
// ensureVaultUnsealed runs one orchestration or CLI step.
|
// ensureVaultUnsealed runs one orchestration or CLI step.
|
||||||
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
|
// Signature: (o *Orchestrator) ensureVaultUnsealed(ctx context.Context) error.
|
||||||
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
// Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve.
|
||||||
|
|||||||
@ -143,6 +143,8 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
|
|||||||
return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
|
return false, "", fmt.Errorf("decode flux kustomizations: %w", err)
|
||||||
}
|
}
|
||||||
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
|
ignored := makeStringSet(o.cfg.Startup.IgnoreFluxKustomizations)
|
||||||
|
required := o.startupRequiredFluxKustomizations()
|
||||||
|
requiredSeen := map[string]struct{}{}
|
||||||
notReady := []string{}
|
notReady := []string{}
|
||||||
for _, ks := range list.Items {
|
for _, ks := range list.Items {
|
||||||
ns := strings.TrimSpace(ks.Metadata.Namespace)
|
ns := strings.TrimSpace(ks.Metadata.Namespace)
|
||||||
@ -154,6 +156,12 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
|
|||||||
if ks.Spec.Suspend {
|
if ks.Spec.Suspend {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if len(required) > 0 {
|
||||||
|
if _, ok := required[full]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
requiredSeen[full] = struct{}{}
|
||||||
|
}
|
||||||
if _, ok := ignored[full]; ok {
|
if _, ok := ignored[full]; ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -173,10 +181,25 @@ func (o *Orchestrator) fluxHealthReady(ctx context.Context) (bool, string, error
|
|||||||
}
|
}
|
||||||
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
|
notReady = append(notReady, fmt.Sprintf("%s(%s)", full, reason))
|
||||||
}
|
}
|
||||||
|
if len(required) > 0 {
|
||||||
|
missing := []string{}
|
||||||
|
for full := range required {
|
||||||
|
if _, ok := requiredSeen[full]; !ok {
|
||||||
|
missing = append(missing, full+"(missing)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(missing) > 0 {
|
||||||
|
sort.Strings(missing)
|
||||||
|
notReady = append(notReady, missing...)
|
||||||
|
}
|
||||||
|
}
|
||||||
if len(notReady) > 0 {
|
if len(notReady) > 0 {
|
||||||
sort.Strings(notReady)
|
sort.Strings(notReady)
|
||||||
return false, "not ready: " + joinLimited(notReady, 6), nil
|
return false, "not ready: " + joinLimited(notReady, 6), nil
|
||||||
}
|
}
|
||||||
|
if len(required) > 0 {
|
||||||
|
return true, fmt.Sprintf("required kustomizations ready=%d", len(requiredSeen)), nil
|
||||||
|
}
|
||||||
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
|
return true, fmt.Sprintf("all kustomizations ready=%d", len(list.Items)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -19,6 +19,7 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
|
|||||||
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
|
if o.runner.DryRun || len(o.cfg.Startup.RequiredNodeLabels) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
|
nodes := make([]string, 0, len(o.cfg.Startup.RequiredNodeLabels))
|
||||||
for node := range o.cfg.Startup.RequiredNodeLabels {
|
for node := range o.cfg.Startup.RequiredNodeLabels {
|
||||||
node = strings.TrimSpace(node)
|
node = strings.TrimSpace(node)
|
||||||
@ -28,6 +29,10 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
|
|||||||
}
|
}
|
||||||
sort.Strings(nodes)
|
sort.Strings(nodes)
|
||||||
for _, node := range nodes {
|
for _, node := range nodes {
|
||||||
|
if _, skip := ignored[node]; skip {
|
||||||
|
o.log.Printf("skipping required node labels for ignored unavailable node %s", node)
|
||||||
|
continue
|
||||||
|
}
|
||||||
labels := o.cfg.Startup.RequiredNodeLabels[node]
|
labels := o.cfg.Startup.RequiredNodeLabels[node]
|
||||||
if len(labels) == 0 {
|
if len(labels) == 0 {
|
||||||
continue
|
continue
|
||||||
@ -55,6 +60,11 @@ func (o *Orchestrator) ensureRequiredNodeLabels(ctx context.Context) error {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
|
if _, err := o.kubectl(ctx, 25*time.Second, args...); err != nil {
|
||||||
|
if isNotFoundErr(err) && !o.startupNodeStrictlyRequired(node) {
|
||||||
|
o.log.Printf("warning: skipping required labels for absent non-core node %s: %v", node, err)
|
||||||
|
o.noteStartupAutoHeal(fmt.Sprintf("skipped required node labels for absent non-core node %s", node))
|
||||||
|
continue
|
||||||
|
}
|
||||||
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
|
return fmt.Errorf("ensure required node labels on %s (%s): %w", node, strings.Join(pairs, ", "), err)
|
||||||
}
|
}
|
||||||
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))
|
o.log.Printf("ensured required labels on node %s: %s", node, strings.Join(pairs, ", "))
|
||||||
|
|||||||
@ -37,14 +37,7 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
return invErr
|
return invErr
|
||||||
}
|
}
|
||||||
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
|
o.noteStartupCheck("node-inventory", true, "inventory/user/port validation passed")
|
||||||
if err := o.waitForAPI(ctx, 1, time.Second); err == nil {
|
o.maybeRunEarlyVaultUnseal(ctx)
|
||||||
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed while kubernetes api is already available")
|
|
||||||
if err := o.ensureVaultUnsealed(ctx); err != nil {
|
|
||||||
o.noteStartupCheck("vault-unseal", false, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
|
|
||||||
}
|
|
||||||
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
|
o.setStartupPhase("preflight-node-reachability", "waiting for ssh reachability across configured inventory")
|
||||||
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
|
if reachErr := o.waitForNodeInventoryReachability(ctx); reachErr != nil {
|
||||||
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
|
o.noteStartupCheck("node-inventory-reachability", false, reachErr.Error())
|
||||||
@ -187,12 +180,9 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
|
o.noteStartupCheck("kubernetes-api", true, "kubernetes api reachable")
|
||||||
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
|
if err := o.runStartupVaultUnsealGate(ctx); err != nil {
|
||||||
if err := o.ensureVaultUnsealed(ctx); err != nil {
|
|
||||||
o.noteStartupCheck("vault-unseal", false, err.Error())
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
|
|
||||||
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
|
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -490,18 +480,3 @@ func (o *Orchestrator) Shutdown(ctx context.Context, opts ShutdownOptions) (err
|
|||||||
o.log.Printf("shutdown flow complete")
|
o.log.Printf("shutdown flow complete")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// normalizeShutdownMode runs one orchestration or CLI step.
|
|
||||||
// Signature: normalizeShutdownMode(raw string) (string, error).
|
|
||||||
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
|
|
||||||
// semantics while preserving compatibility with legacy "config" callers.
|
|
||||||
func normalizeShutdownMode(raw string) (string, error) {
|
|
||||||
switch strings.TrimSpace(raw) {
|
|
||||||
case "", "config", "cluster-only":
|
|
||||||
return "cluster-only", nil
|
|
||||||
case "poweroff":
|
|
||||||
return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
|
|
||||||
default:
|
|
||||||
return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@ -30,7 +30,7 @@ func (o *Orchestrator) waitForNodeInventoryReachability(ctx context.Context) err
|
|||||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
|
targets := make([]string, 0, len(o.inventoryNodesForValidation()))
|
||||||
seen := map[string]struct{}{}
|
seen := map[string]struct{}{}
|
||||||
for _, node := range o.inventoryNodesForValidation() {
|
for _, node := range startupRequiredNodes(o.inventoryNodesForValidation(), o.cfg.Startup.NodeInventoryReachRequiredNodes) {
|
||||||
node = strings.TrimSpace(node)
|
node = strings.TrimSpace(node)
|
||||||
if node == "" {
|
if node == "" {
|
||||||
continue
|
continue
|
||||||
|
|||||||
21
internal/cluster/orchestrator_shutdown_mode.go
Normal file
21
internal/cluster/orchestrator_shutdown_mode.go
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
package cluster
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// normalizeShutdownMode maps a user-supplied shutdown mode onto the canonical mode name.
// Signature: normalizeShutdownMode(raw string) (string, error).
// Why: keeps shutdown behavior explicit and safe by allowing only cluster-only
// semantics while preserving compatibility with legacy "config" callers.
//
// Surrounding whitespace in raw is ignored for matching. The empty string,
// "config" and "cluster-only" all normalize to "cluster-only". "poweroff" and
// any other value yield an error that quotes raw exactly as it was passed in.
func normalizeShutdownMode(raw string) (string, error) {
	mode := strings.TrimSpace(raw)

	if mode == "" || mode == "config" || mode == "cluster-only" {
		return "cluster-only", nil
	}
	if mode == "poweroff" {
		// Kept as a distinct case so callers of the removed mode get a specific message.
		return "", fmt.Errorf("shutdown mode %q has been removed; ananke no longer powers off hosts", raw)
	}
	return "", fmt.Errorf("unsupported shutdown mode %q (expected config|cluster-only)", raw)
}
|
||||||
81
internal/cluster/orchestrator_startup_scope.go
Normal file
81
internal/cluster/orchestrator_startup_scope.go
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
package cluster
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
// startupRequiredNodes runs one orchestration or CLI step.
|
||||||
|
// Signature: startupRequiredNodes(nodes []string, required []string) []string.
|
||||||
|
// Why: lets startup enforce a smaller core node set during outage recovery
|
||||||
|
// without losing the stricter all-nodes behavior when no override is configured.
|
||||||
|
func startupRequiredNodes(nodes []string, required []string) []string {
|
||||||
|
requiredSet := makeStringSet(required)
|
||||||
|
if len(requiredSet) == 0 {
|
||||||
|
return nodes
|
||||||
|
}
|
||||||
|
filtered := make([]string, 0, len(nodes))
|
||||||
|
for _, node := range nodes {
|
||||||
|
node = strings.TrimSpace(node)
|
||||||
|
if node == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := requiredSet[node]; ok {
|
||||||
|
filtered = append(filtered, node)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return filtered
|
||||||
|
}
|
||||||
|
|
||||||
|
// startupNodeStrictlyRequired runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) startupNodeStrictlyRequired(node string) bool.
|
||||||
|
// Why: absent or broken non-core nodes should not block recovery-only actions
|
||||||
|
// like label reconciliation once the operator has narrowed startup to core nodes.
|
||||||
|
func (o *Orchestrator) startupNodeStrictlyRequired(node string) bool {
|
||||||
|
node = strings.TrimSpace(node)
|
||||||
|
if node == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if len(o.cfg.Startup.NodeInventoryReachRequiredNodes) == 0 && len(o.cfg.Startup.NodeSSHAuthRequiredNodes) == 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
for _, controlPlane := range o.cfg.ControlPlanes {
|
||||||
|
if strings.TrimSpace(controlPlane) == node {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if containsNode(o.cfg.Startup.NodeInventoryReachRequiredNodes, node) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return containsNode(o.cfg.Startup.NodeSSHAuthRequiredNodes, node)
|
||||||
|
}
|
||||||
|
|
||||||
|
// startupRequiredFluxKustomizations runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{}.
|
||||||
|
// Why: lets outage recovery wait on a declared core GitOps slice while leaving
|
||||||
|
// optional stacks free to converge after bootstrap succeeds.
|
||||||
|
func (o *Orchestrator) startupRequiredFluxKustomizations() map[string]struct{} {
|
||||||
|
return makeStringSet(o.cfg.Startup.FluxHealthRequiredKustomizations)
|
||||||
|
}
|
||||||
|
|
||||||
|
// startupRequiredWorkloadNamespaces runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{}.
|
||||||
|
// Why: keeps workload readiness scoped to core namespaces during recovery while
|
||||||
|
// preserving broad convergence checks when no explicit core list is configured.
|
||||||
|
func (o *Orchestrator) startupRequiredWorkloadNamespaces() map[string]struct{} {
|
||||||
|
return makeStringSet(o.cfg.Startup.WorkloadConvergenceRequiredNamespaces)
|
||||||
|
}
|
||||||
|
|
||||||
|
// containsNode reports whether needle matches any entry once both sides are whitespace-trimmed.
// Signature: containsNode(entries []string, needle string) bool.
// Why: keeps node-scope checks small and explicit anywhere startup narrows its
// recovery gates to a declared core set.
//
// A needle that is empty after trimming never matches, even if entries holds
// blank strings. A nil or empty entries slice yields false.
func containsNode(entries []string, needle string) bool {
	want := strings.TrimSpace(needle)
	if want != "" {
		for i := range entries {
			if strings.TrimSpace(entries[i]) == want {
				return true
			}
		}
	}
	return false
}
|
||||||
52
internal/cluster/orchestrator_startup_vault.go
Normal file
52
internal/cluster/orchestrator_startup_vault.go
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
package cluster
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// maybeRunEarlyVaultUnseal runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context).
|
||||||
|
// Why: gives startup a best-effort Vault recovery path when the API is already
|
||||||
|
// live, without consuming the hard startup failure path before workloads recover.
|
||||||
|
func (o *Orchestrator) maybeRunEarlyVaultUnseal(ctx context.Context) {
|
||||||
|
if err := o.waitForAPI(ctx, 1, time.Second); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
o.noteStartupCheckState("vault-unseal-early", "running", "best-effort early vault unseal while kubernetes api is already available")
|
||||||
|
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
|
||||||
|
if err != nil {
|
||||||
|
o.log.Printf("warning: early vault unseal deferred: %v", err)
|
||||||
|
o.noteStartupAutoHeal(fmt.Sprintf("deferred early vault unseal: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if deferred {
|
||||||
|
o.log.Printf("vault early unseal deferred: %s", detail)
|
||||||
|
o.noteStartupAutoHeal(detail)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
o.noteStartupCheck("vault-unseal-early", true, "vault is already unsealed")
|
||||||
|
}
|
||||||
|
|
||||||
|
// runStartupVaultUnsealGate runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error.
|
||||||
|
// Why: keeps the top-level startup flow readable while allowing Vault unseal to
|
||||||
|
// defer cleanly until critical workload recovery when the pod is not runnable yet.
|
||||||
|
func (o *Orchestrator) runStartupVaultUnsealGate(ctx context.Context) error {
|
||||||
|
o.noteStartupCheckState("vault-unseal", "running", "ensuring vault is unsealed before startup gates")
|
||||||
|
deferred, detail, err := o.ensureVaultUnsealedWhenRunnable(ctx)
|
||||||
|
if err != nil {
|
||||||
|
o.noteStartupCheck("vault-unseal", false, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if deferred {
|
||||||
|
o.log.Printf("vault unseal deferred until workload recovery: %s", detail)
|
||||||
|
o.noteStartupAutoHeal(detail)
|
||||||
|
o.noteStartupCheck("vault-unseal", true, detail)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
o.noteStartupCheck("vault-unseal", true, "vault is unsealed")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
@ -71,6 +71,7 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
|
|||||||
if err := json.Unmarshal([]byte(out), &list); err != nil {
|
if err := json.Unmarshal([]byte(out), &list); err != nil {
|
||||||
return false, "", fmt.Errorf("decode controllers: %w", err)
|
return false, "", fmt.Errorf("decode controllers: %w", err)
|
||||||
}
|
}
|
||||||
|
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
||||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
|
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
|
||||||
@ -84,6 +85,11 @@ func (o *Orchestrator) workloadConvergenceReady(ctx context.Context) (bool, stri
|
|||||||
if kind == "" || ns == "" || name == "" {
|
if kind == "" || ns == "" || name == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if len(requiredNamespaces) > 0 {
|
||||||
|
if _, ok := requiredNamespaces[ns]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
if _, ok := ignoredNamespaces[ns]; ok {
|
if _, ok := ignoredNamespaces[ns]; ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@ -116,6 +116,7 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
|||||||
return nil, fmt.Errorf("decode pods: %w", err)
|
return nil, fmt.Errorf("decode pods: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
requiredNamespaces := o.startupRequiredWorkloadNamespaces()
|
||||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||||
stuckReasons := map[string]struct{}{
|
stuckReasons := map[string]struct{}{
|
||||||
@ -138,6 +139,11 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
|||||||
if ns == "" || name == "" {
|
if ns == "" || name == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if len(requiredNamespaces) > 0 {
|
||||||
|
if _, ok := requiredNamespaces[ns]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
if _, ok := ignoredNamespaces[ns]; ok {
|
if _, ok := ignoredNamespaces[ns]; ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
55
internal/cluster/testing_hooks_startup.go
Normal file
55
internal/cluster/testing_hooks_startup.go
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
package cluster
|
||||||
|
|
||||||
|
import "context"
|
||||||
|
|
||||||
|
// TestHookStartupRequiredNodes returns the result of startupRequiredNodes(nodes, required).
|
||||||
|
// Signature: TestHookStartupRequiredNodes(nodes []string, required []string) []string.
|
||||||
|
// Why: exposes recovery-scope node filtering so top-level tests can cover core-only startup narrowing.
|
||||||
|
func TestHookStartupRequiredNodes(nodes []string, required []string) []string {
|
||||||
|
return startupRequiredNodes(nodes, required)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHookContainsNode returns the result of containsNode(entries, needle).
|
||||||
|
// Signature: TestHookContainsNode(entries []string, needle string) bool.
|
||||||
|
// Why: exposes the small startup-scope membership helper to top-level tests.
|
||||||
|
func TestHookContainsNode(entries []string, needle string) bool {
|
||||||
|
return containsNode(entries, needle)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHookStartupNodeStrictlyRequired returns the result of o.startupNodeStrictlyRequired(node).
|
||||||
|
// Signature: (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool.
|
||||||
|
// Why: exposes strict-node startup scoping so outage-recovery tests can confirm
|
||||||
|
// non-core nodes stop blocking bootstrap.
|
||||||
|
func (o *Orchestrator) TestHookStartupNodeStrictlyRequired(node string) bool {
|
||||||
|
return o.startupNodeStrictlyRequired(node)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHookStartupRequiredFluxKustomizations returns the set produced by o.startupRequiredFluxKustomizations().
|
||||||
|
// Signature: (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{}.
|
||||||
|
// Why: exposes flux startup scoping so top-level tests can confirm only core
|
||||||
|
// kustomizations block emergency bootstrap.
|
||||||
|
func (o *Orchestrator) TestHookStartupRequiredFluxKustomizations() map[string]struct{} {
|
||||||
|
return o.startupRequiredFluxKustomizations()
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHookStartupRequiredWorkloadNamespaces returns the set produced by o.startupRequiredWorkloadNamespaces().
|
||||||
|
// Signature: (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{}.
|
||||||
|
// Why: exposes workload namespace startup scoping so top-level tests can
|
||||||
|
// confirm only core workloads block emergency bootstrap.
|
||||||
|
func (o *Orchestrator) TestHookStartupRequiredWorkloadNamespaces() map[string]struct{} {
|
||||||
|
return o.startupRequiredWorkloadNamespaces()
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHookMaybeRunEarlyVaultUnseal calls o.maybeRunEarlyVaultUnseal(ctx) and returns nothing.
|
||||||
|
// Signature: (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context).
|
||||||
|
// Why: exposes the early startup Vault deferral helper to top-level tests.
|
||||||
|
func (o *Orchestrator) TestHookMaybeRunEarlyVaultUnseal(ctx context.Context) {
|
||||||
|
o.maybeRunEarlyVaultUnseal(ctx)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHookRunStartupVaultUnsealGate returns the error from o.runStartupVaultUnsealGate(ctx).
|
||||||
|
// Signature: (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error.
|
||||||
|
// Why: exposes the startup Vault gate helper to top-level tests.
|
||||||
|
func (o *Orchestrator) TestHookRunStartupVaultUnsealGate(ctx context.Context) error {
|
||||||
|
return o.runStartupVaultUnsealGate(ctx)
|
||||||
|
}
|
||||||
@ -33,6 +33,9 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
||||||
c.Startup.NodeInventoryReachPollSeconds = 5
|
c.Startup.NodeInventoryReachPollSeconds = 5
|
||||||
}
|
}
|
||||||
|
if c.Startup.NodeInventoryReachRequiredNodes == nil {
|
||||||
|
c.Startup.NodeInventoryReachRequiredNodes = []string{}
|
||||||
|
}
|
||||||
if c.Startup.RequiredNodeLabels == nil {
|
if c.Startup.RequiredNodeLabels == nil {
|
||||||
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
c.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||||
"titan-09": {
|
"titan-09": {
|
||||||
@ -121,7 +124,11 @@ func (c *Config) applyDefaults() {
|
|||||||
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
|
if strings.TrimSpace(c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey) == "" {
|
||||||
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
|
c.Startup.ServiceChecklistAuth.AdminSecretPasswordKey = "password"
|
||||||
}
|
}
|
||||||
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
|
if c.Startup.ServiceChecklistExplicitOnly {
|
||||||
|
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, []ServiceChecklistCheck{})
|
||||||
|
} else {
|
||||||
|
c.Startup.ServiceChecklist = mergeServiceChecklistDefaults(c.Startup.ServiceChecklist, defaultServiceChecklist())
|
||||||
|
}
|
||||||
for i := range c.Startup.ServiceChecklist {
|
for i := range c.Startup.ServiceChecklist {
|
||||||
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
if c.Startup.ServiceChecklist[i].TimeoutSeconds <= 0 {
|
||||||
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
c.Startup.ServiceChecklist[i].TimeoutSeconds = 12
|
||||||
@ -152,12 +159,18 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
||||||
c.Startup.NodeSSHAuthPollSeconds = 5
|
c.Startup.NodeSSHAuthPollSeconds = 5
|
||||||
}
|
}
|
||||||
|
if c.Startup.NodeSSHAuthRequiredNodes == nil {
|
||||||
|
c.Startup.NodeSSHAuthRequiredNodes = []string{}
|
||||||
|
}
|
||||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||||
c.Startup.FluxHealthWaitSeconds = 900
|
c.Startup.FluxHealthWaitSeconds = 900
|
||||||
}
|
}
|
||||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||||
c.Startup.FluxHealthPollSeconds = 5
|
c.Startup.FluxHealthPollSeconds = 5
|
||||||
}
|
}
|
||||||
|
if c.Startup.FluxHealthRequiredKustomizations == nil {
|
||||||
|
c.Startup.FluxHealthRequiredKustomizations = []string{}
|
||||||
|
}
|
||||||
if c.Startup.IgnoreFluxKustomizations == nil {
|
if c.Startup.IgnoreFluxKustomizations == nil {
|
||||||
c.Startup.IgnoreFluxKustomizations = []string{}
|
c.Startup.IgnoreFluxKustomizations = []string{}
|
||||||
}
|
}
|
||||||
@ -167,6 +180,9 @@ func (c *Config) applyDefaults() {
|
|||||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||||
c.Startup.WorkloadConvergencePollSeconds = 5
|
c.Startup.WorkloadConvergencePollSeconds = 5
|
||||||
}
|
}
|
||||||
|
if c.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
||||||
|
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{}
|
||||||
|
}
|
||||||
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
if c.Startup.IgnoreWorkloadNamespaces == nil {
|
||||||
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
c.Startup.IgnoreWorkloadNamespaces = []string{}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -39,24 +39,25 @@ func defaults() Config {
|
|||||||
"maintenance",
|
"maintenance",
|
||||||
},
|
},
|
||||||
Startup: Startup{
|
Startup: Startup{
|
||||||
APIWaitSeconds: 1200,
|
APIWaitSeconds: 1200,
|
||||||
APIPollSeconds: 2,
|
APIPollSeconds: 2,
|
||||||
ShutdownCooldownSeconds: 45,
|
ShutdownCooldownSeconds: 45,
|
||||||
RequireNodeInventoryReach: true,
|
RequireNodeInventoryReach: true,
|
||||||
NodeInventoryReachWaitSeconds: 300,
|
NodeInventoryReachWaitSeconds: 300,
|
||||||
NodeInventoryReachPollSeconds: 5,
|
NodeInventoryReachPollSeconds: 5,
|
||||||
RequireTimeSync: true,
|
NodeInventoryReachRequiredNodes: []string{},
|
||||||
TimeSyncWaitSeconds: 240,
|
RequireTimeSync: true,
|
||||||
TimeSyncPollSeconds: 5,
|
TimeSyncWaitSeconds: 240,
|
||||||
TimeSyncMode: "quorum",
|
TimeSyncPollSeconds: 5,
|
||||||
TimeSyncQuorum: 2,
|
TimeSyncMode: "quorum",
|
||||||
ReconcileAccessOnBoot: true,
|
TimeSyncQuorum: 2,
|
||||||
AutoEtcdRestoreOnAPIFailure: true,
|
ReconcileAccessOnBoot: true,
|
||||||
EtcdRestoreControlPlane: "titan-0a",
|
AutoEtcdRestoreOnAPIFailure: true,
|
||||||
RequireStorageReady: true,
|
EtcdRestoreControlPlane: "titan-0a",
|
||||||
StorageReadyWaitSeconds: 420,
|
RequireStorageReady: true,
|
||||||
StorageReadyPollSeconds: 5,
|
StorageReadyWaitSeconds: 420,
|
||||||
StorageMinReadyNodes: 2,
|
StorageReadyPollSeconds: 5,
|
||||||
|
StorageMinReadyNodes: 2,
|
||||||
StorageCriticalPVCs: []string{
|
StorageCriticalPVCs: []string{
|
||||||
"vault/data-vault-0",
|
"vault/data-vault-0",
|
||||||
"postgres/postgres-data-postgres-0",
|
"postgres/postgres-data-postgres-0",
|
||||||
@ -91,33 +92,36 @@ func defaults() Config {
|
|||||||
AdminSecretUsernameKey: "username",
|
AdminSecretUsernameKey: "username",
|
||||||
AdminSecretPasswordKey: "password",
|
AdminSecretPasswordKey: "password",
|
||||||
},
|
},
|
||||||
ServiceChecklist: defaultServiceChecklist(),
|
ServiceChecklist: defaultServiceChecklist(),
|
||||||
RequireCriticalServiceEndpoints: true,
|
RequireCriticalServiceEndpoints: true,
|
||||||
CriticalServiceEndpointWaitSec: 420,
|
CriticalServiceEndpointWaitSec: 420,
|
||||||
CriticalServiceEndpointPollSec: 5,
|
CriticalServiceEndpointPollSec: 5,
|
||||||
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
|
CriticalServiceEndpoints: defaultCriticalServiceEndpoints(),
|
||||||
RequireIngressChecklist: true,
|
RequireIngressChecklist: true,
|
||||||
IngressChecklistWaitSeconds: 420,
|
IngressChecklistWaitSeconds: 420,
|
||||||
IngressChecklistPollSeconds: 5,
|
IngressChecklistPollSeconds: 5,
|
||||||
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
|
IngressChecklistAccepted: []int{200, 301, 302, 307, 308, 401, 403, 404},
|
||||||
IngressChecklistIgnoreHosts: []string{},
|
IngressChecklistIgnoreHosts: []string{},
|
||||||
RequireNodeSSHAuth: true,
|
RequireNodeSSHAuth: true,
|
||||||
NodeSSHAuthWaitSeconds: 240,
|
NodeSSHAuthWaitSeconds: 240,
|
||||||
NodeSSHAuthPollSeconds: 5,
|
NodeSSHAuthPollSeconds: 5,
|
||||||
RequireFluxHealth: true,
|
NodeSSHAuthRequiredNodes: []string{},
|
||||||
FluxHealthWaitSeconds: 900,
|
RequireFluxHealth: true,
|
||||||
FluxHealthPollSeconds: 5,
|
FluxHealthWaitSeconds: 900,
|
||||||
IgnoreFluxKustomizations: []string{},
|
FluxHealthPollSeconds: 5,
|
||||||
RequireWorkloadConvergence: true,
|
FluxHealthRequiredKustomizations: []string{},
|
||||||
WorkloadConvergenceWaitSeconds: 900,
|
IgnoreFluxKustomizations: []string{},
|
||||||
WorkloadConvergencePollSeconds: 5,
|
RequireWorkloadConvergence: true,
|
||||||
IgnoreWorkloadNamespaces: []string{},
|
WorkloadConvergenceWaitSeconds: 900,
|
||||||
IgnoreWorkloads: []string{},
|
WorkloadConvergencePollSeconds: 5,
|
||||||
IgnoreUnavailableNodes: []string{},
|
WorkloadConvergenceRequiredNamespaces: []string{},
|
||||||
AutoRecycleStuckPods: true,
|
IgnoreWorkloadNamespaces: []string{},
|
||||||
StuckPodGraceSeconds: 180,
|
IgnoreWorkloads: []string{},
|
||||||
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
IgnoreUnavailableNodes: []string{},
|
||||||
VaultUnsealBreakglassTimeout: 15,
|
AutoRecycleStuckPods: true,
|
||||||
|
StuckPodGraceSeconds: 180,
|
||||||
|
VaultUnsealKeyFile: "/var/lib/ananke/vault-unseal.key",
|
||||||
|
VaultUnsealBreakglassTimeout: 15,
|
||||||
},
|
},
|
||||||
Shutdown: Shutdown{
|
Shutdown: Shutdown{
|
||||||
DefaultBudgetSeconds: 1380,
|
DefaultBudgetSeconds: 1380,
|
||||||
|
|||||||
@ -51,3 +51,41 @@ startup:
|
|||||||
t.Fatalf("expected validation failure")
|
t.Fatalf("expected validation failure")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestLoadKeepsExplicitServiceChecklist checks that Load keeps exactly the one configured check (gitea-api) when service_checklist_explicit_only is true.
|
||||||
|
// Signature: TestLoadKeepsExplicitServiceChecklist(t *testing.T).
|
||||||
|
// Why: host recovery configs must be able to keep a narrow, explicit checklist
|
||||||
|
// without silently inheriting the full default service catalog.
|
||||||
|
func TestLoadKeepsExplicitServiceChecklist(t *testing.T) {
|
||||||
|
cfgPath := filepath.Join(t.TempDir(), "ananke.yaml")
|
||||||
|
raw := `
|
||||||
|
control_planes: [titan-0a]
|
||||||
|
expected_flux_branch: main
|
||||||
|
expected_flux_source_url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||||
|
iac_repo_path: /opt/titan-iac
|
||||||
|
startup:
|
||||||
|
service_checklist_explicit_only: true
|
||||||
|
service_checklist:
|
||||||
|
- name: gitea-api
|
||||||
|
url: https://scm.bstein.dev/api/healthz
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: pass
|
||||||
|
timeout_seconds: 12
|
||||||
|
ups:
|
||||||
|
enabled: false
|
||||||
|
`
|
||||||
|
if err := os.WriteFile(cfgPath, []byte(raw), 0o644); err != nil {
|
||||||
|
t.Fatalf("write config: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg, err := Load(cfgPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("load config: %v", err)
|
||||||
|
}
|
||||||
|
if len(cfg.Startup.ServiceChecklist) != 1 {
|
||||||
|
t.Fatalf("expected 1 explicit service check, got %d", len(cfg.Startup.ServiceChecklist))
|
||||||
|
}
|
||||||
|
if cfg.Startup.ServiceChecklist[0].Name != "gitea-api" {
|
||||||
|
t.Fatalf("expected explicit gitea-api check, got %q", cfg.Startup.ServiceChecklist[0].Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -27,65 +27,70 @@ type Config struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Startup struct {
|
type Startup struct {
|
||||||
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
APIWaitSeconds int `yaml:"api_wait_seconds"`
|
||||||
APIPollSeconds int `yaml:"api_poll_seconds"`
|
APIPollSeconds int `yaml:"api_poll_seconds"`
|
||||||
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
ShutdownCooldownSeconds int `yaml:"shutdown_cooldown_seconds"`
|
||||||
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
MinimumBatteryPercent float64 `yaml:"minimum_battery_percent"`
|
||||||
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
|
RequireNodeInventoryReach bool `yaml:"require_node_inventory_reachability"`
|
||||||
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
NodeInventoryReachWaitSeconds int `yaml:"node_inventory_reachability_wait_seconds"`
|
||||||
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
NodeInventoryReachPollSeconds int `yaml:"node_inventory_reachability_poll_seconds"`
|
||||||
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
NodeInventoryReachRequiredNodes []string `yaml:"node_inventory_reachability_required_nodes"`
|
||||||
RequireTimeSync bool `yaml:"require_time_sync"`
|
RequiredNodeLabels map[string]map[string]string `yaml:"required_node_labels"`
|
||||||
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
RequireTimeSync bool `yaml:"require_time_sync"`
|
||||||
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
TimeSyncWaitSeconds int `yaml:"time_sync_wait_seconds"`
|
||||||
TimeSyncMode string `yaml:"time_sync_mode"`
|
TimeSyncPollSeconds int `yaml:"time_sync_poll_seconds"`
|
||||||
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
TimeSyncMode string `yaml:"time_sync_mode"`
|
||||||
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
TimeSyncQuorum int `yaml:"time_sync_quorum"`
|
||||||
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
ReconcileAccessOnBoot bool `yaml:"reconcile_access_on_boot"`
|
||||||
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
AutoEtcdRestoreOnAPIFailure bool `yaml:"auto_etcd_restore_on_api_failure"`
|
||||||
RequireStorageReady bool `yaml:"require_storage_ready"`
|
EtcdRestoreControlPlane string `yaml:"etcd_restore_control_plane"`
|
||||||
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
RequireStorageReady bool `yaml:"require_storage_ready"`
|
||||||
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
StorageReadyWaitSeconds int `yaml:"storage_ready_wait_seconds"`
|
||||||
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
StorageReadyPollSeconds int `yaml:"storage_ready_poll_seconds"`
|
||||||
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
StorageMinReadyNodes int `yaml:"storage_min_ready_nodes"`
|
||||||
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
StorageCriticalPVCs []string `yaml:"storage_critical_pvcs"`
|
||||||
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
RequirePostStartProbes bool `yaml:"require_post_start_probes"`
|
||||||
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
PostStartProbeWaitSeconds int `yaml:"post_start_probe_wait_seconds"`
|
||||||
PostStartProbes []string `yaml:"post_start_probes"`
|
PostStartProbePollSeconds int `yaml:"post_start_probe_poll_seconds"`
|
||||||
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
PostStartProbes []string `yaml:"post_start_probes"`
|
||||||
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
RequireServiceChecklist bool `yaml:"require_service_checklist"`
|
||||||
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
ServiceChecklistWaitSeconds int `yaml:"service_checklist_wait_seconds"`
|
||||||
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
ServiceChecklistPollSeconds int `yaml:"service_checklist_poll_seconds"`
|
||||||
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
|
ServiceChecklistStabilitySec int `yaml:"service_checklist_stability_seconds"`
|
||||||
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
ServiceChecklistAuth ServiceChecklistAuthSettings `yaml:"service_checklist_auth"`
|
||||||
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
|
ServiceChecklistExplicitOnly bool `yaml:"service_checklist_explicit_only"`
|
||||||
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
|
ServiceChecklist []ServiceChecklistCheck `yaml:"service_checklist"`
|
||||||
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
|
RequireCriticalServiceEndpoints bool `yaml:"require_critical_service_endpoints"`
|
||||||
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
|
CriticalServiceEndpointWaitSec int `yaml:"critical_service_endpoint_wait_seconds"`
|
||||||
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
|
CriticalServiceEndpointPollSec int `yaml:"critical_service_endpoint_poll_seconds"`
|
||||||
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
|
CriticalServiceEndpoints []string `yaml:"critical_service_endpoints"`
|
||||||
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
|
RequireIngressChecklist bool `yaml:"require_ingress_checklist"`
|
||||||
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
|
IngressChecklistWaitSeconds int `yaml:"ingress_checklist_wait_seconds"`
|
||||||
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
|
IngressChecklistPollSeconds int `yaml:"ingress_checklist_poll_seconds"`
|
||||||
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
|
IngressChecklistAccepted []int `yaml:"ingress_checklist_accepted_statuses"`
|
||||||
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
IngressChecklistIgnoreHosts []string `yaml:"ingress_checklist_ignore_hosts"`
|
||||||
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
IngressChecklistInsecureSkip bool `yaml:"ingress_checklist_insecure_skip_tls"`
|
||||||
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
RequireNodeSSHAuth bool `yaml:"require_node_ssh_auth"`
|
||||||
RequireFluxHealth bool `yaml:"require_flux_health"`
|
NodeSSHAuthWaitSeconds int `yaml:"node_ssh_auth_wait_seconds"`
|
||||||
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
NodeSSHAuthPollSeconds int `yaml:"node_ssh_auth_poll_seconds"`
|
||||||
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
NodeSSHAuthRequiredNodes []string `yaml:"node_ssh_auth_required_nodes"`
|
||||||
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
RequireFluxHealth bool `yaml:"require_flux_health"`
|
||||||
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
FluxHealthWaitSeconds int `yaml:"flux_health_wait_seconds"`
|
||||||
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
FluxHealthPollSeconds int `yaml:"flux_health_poll_seconds"`
|
||||||
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
FluxHealthRequiredKustomizations []string `yaml:"flux_health_required_kustomizations"`
|
||||||
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
IgnoreFluxKustomizations []string `yaml:"ignore_flux_kustomizations"`
|
||||||
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
RequireWorkloadConvergence bool `yaml:"require_workload_convergence"`
|
||||||
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
WorkloadConvergenceWaitSeconds int `yaml:"workload_convergence_wait_seconds"`
|
||||||
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
WorkloadConvergencePollSeconds int `yaml:"workload_convergence_poll_seconds"`
|
||||||
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
WorkloadConvergenceRequiredNamespaces []string `yaml:"workload_convergence_required_namespaces"`
|
||||||
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
IgnoreWorkloadNamespaces []string `yaml:"ignore_workload_namespaces"`
|
||||||
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
IgnoreWorkloads []string `yaml:"ignore_workloads"`
|
||||||
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
IgnoreUnavailableNodes []string `yaml:"ignore_unavailable_nodes"`
|
||||||
|
AutoRecycleStuckPods bool `yaml:"auto_recycle_stuck_pods"`
|
||||||
|
StuckPodGraceSeconds int `yaml:"stuck_pod_grace_seconds"`
|
||||||
|
VaultUnsealKeyFile string `yaml:"vault_unseal_key_file"`
|
||||||
|
VaultUnsealBreakglassCommand string `yaml:"vault_unseal_breakglass_command"`
|
||||||
|
VaultUnsealBreakglassTimeout int `yaml:"vault_unseal_breakglass_timeout_seconds"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ServiceChecklistCheck struct {
|
type ServiceChecklistCheck struct {
|
||||||
|
|||||||
@ -61,6 +61,11 @@ func (c Config) Validate() error {
|
|||||||
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
if c.Startup.NodeInventoryReachPollSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
|
return fmt.Errorf("config.startup.node_inventory_reachability_poll_seconds must be > 0")
|
||||||
}
|
}
|
||||||
|
for _, node := range c.Startup.NodeInventoryReachRequiredNodes {
|
||||||
|
if strings.TrimSpace(node) == "" {
|
||||||
|
return fmt.Errorf("config.startup.node_inventory_reachability_required_nodes entries must not be empty")
|
||||||
|
}
|
||||||
|
}
|
||||||
for node, labels := range c.Startup.RequiredNodeLabels {
|
for node, labels := range c.Startup.RequiredNodeLabels {
|
||||||
if strings.TrimSpace(node) == "" {
|
if strings.TrimSpace(node) == "" {
|
||||||
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
return fmt.Errorf("config.startup.required_node_labels keys must not be empty")
|
||||||
@ -233,18 +238,37 @@ func (c Config) Validate() error {
|
|||||||
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
if c.Startup.NodeSSHAuthPollSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
|
return fmt.Errorf("config.startup.node_ssh_auth_poll_seconds must be > 0")
|
||||||
}
|
}
|
||||||
|
for _, node := range c.Startup.NodeSSHAuthRequiredNodes {
|
||||||
|
if strings.TrimSpace(node) == "" {
|
||||||
|
return fmt.Errorf("config.startup.node_ssh_auth_required_nodes entries must not be empty")
|
||||||
|
}
|
||||||
|
}
|
||||||
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
if c.Startup.FluxHealthWaitSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
|
return fmt.Errorf("config.startup.flux_health_wait_seconds must be > 0")
|
||||||
}
|
}
|
||||||
if c.Startup.FluxHealthPollSeconds <= 0 {
|
if c.Startup.FluxHealthPollSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
|
return fmt.Errorf("config.startup.flux_health_poll_seconds must be > 0")
|
||||||
}
|
}
|
||||||
|
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
|
||||||
|
item = strings.TrimSpace(item)
|
||||||
|
if item == "" {
|
||||||
|
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must not be empty")
|
||||||
|
}
|
||||||
|
if strings.Count(item, "/") != 1 {
|
||||||
|
return fmt.Errorf("config.startup.flux_health_required_kustomizations entries must be namespace/name, got %q", item)
|
||||||
|
}
|
||||||
|
}
|
||||||
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
if c.Startup.WorkloadConvergenceWaitSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
|
return fmt.Errorf("config.startup.workload_convergence_wait_seconds must be > 0")
|
||||||
}
|
}
|
||||||
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
if c.Startup.WorkloadConvergencePollSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
|
return fmt.Errorf("config.startup.workload_convergence_poll_seconds must be > 0")
|
||||||
}
|
}
|
||||||
|
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
|
||||||
|
if strings.TrimSpace(ns) == "" {
|
||||||
|
return fmt.Errorf("config.startup.workload_convergence_required_namespaces entries must not be empty")
|
||||||
|
}
|
||||||
|
}
|
||||||
if c.Startup.StuckPodGraceSeconds <= 0 {
|
if c.Startup.StuckPodGraceSeconds <= 0 {
|
||||||
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
|
||||||
}
|
}
|
||||||
@ -277,6 +301,16 @@ func (c Config) Validate() error {
|
|||||||
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
|
return fmt.Errorf("config.startup.ignore_workload_namespaces entries must not be empty")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for _, item := range c.Startup.FluxHealthRequiredKustomizations {
|
||||||
|
if containsTrimmed(c.Startup.IgnoreFluxKustomizations, item) {
|
||||||
|
return fmt.Errorf("config.startup.flux_health_required_kustomizations must not overlap ignore_flux_kustomizations (%q)", strings.TrimSpace(item))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, ns := range c.Startup.WorkloadConvergenceRequiredNamespaces {
|
||||||
|
if containsTrimmed(c.Startup.IgnoreWorkloadNamespaces, ns) {
|
||||||
|
return fmt.Errorf("config.startup.workload_convergence_required_namespaces must not overlap ignore_workload_namespaces (%q)", strings.TrimSpace(ns))
|
||||||
|
}
|
||||||
|
}
|
||||||
for _, node := range c.Startup.IgnoreUnavailableNodes {
|
for _, node := range c.Startup.IgnoreUnavailableNodes {
|
||||||
if strings.TrimSpace(node) == "" {
|
if strings.TrimSpace(node) == "" {
|
||||||
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
|
return fmt.Errorf("config.startup.ignore_unavailable_nodes entries must not be empty")
|
||||||
@ -328,3 +362,20 @@ func (c Config) Validate() error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// containsTrimmed reports whether any entry equals needle once both are whitespace-trimmed; a blank needle never matches.
|
||||||
|
// Signature: containsTrimmed(entries []string, needle string) bool.
|
||||||
|
// Why: startup config now supports both required and ignored recovery scopes, so
|
||||||
|
// validation needs a single normalized overlap check for those lists.
|
||||||
|
func containsTrimmed(entries []string, needle string) bool {
|
||||||
|
needle = strings.TrimSpace(needle)
|
||||||
|
if needle == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for _, entry := range entries {
|
||||||
|
if strings.TrimSpace(entry) == needle {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|||||||
@ -30,6 +30,7 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
|||||||
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
|
{"bad_api_poll", func(c *Config) { c.Startup.APIPollSeconds = 0 }},
|
||||||
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
|
{"bad_min_battery_percent", func(c *Config) { c.Startup.MinimumBatteryPercent = 101 }},
|
||||||
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
|
{"bad_node_inventory_poll", func(c *Config) { c.Startup.NodeInventoryReachPollSeconds = 0 }},
|
||||||
|
{"bad_empty_node_inventory_required_node", func(c *Config) { c.Startup.NodeInventoryReachRequiredNodes = []string{"titan-0a", ""} }},
|
||||||
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
|
{"bad_time_sync_wait", func(c *Config) { c.Startup.TimeSyncWaitSeconds = 0 }},
|
||||||
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
|
{"bad_time_sync_poll", func(c *Config) { c.Startup.TimeSyncPollSeconds = 0 }},
|
||||||
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
|
{"bad_time_sync_quorum", func(c *Config) { c.Startup.TimeSyncMode = "quorum"; c.Startup.TimeSyncQuorum = 0 }},
|
||||||
@ -68,15 +69,27 @@ func TestValidateRejectsInvalidFieldsMatrix(t *testing.T) {
|
|||||||
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
|
{"bad_ingress_ignore_entry", func(c *Config) { c.Startup.IngressChecklistIgnoreHosts = []string{"", "grafana.bstein.dev"} }},
|
||||||
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
|
{"bad_node_ssh_wait", func(c *Config) { c.Startup.NodeSSHAuthWaitSeconds = 0 }},
|
||||||
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
|
{"bad_node_ssh_poll", func(c *Config) { c.Startup.NodeSSHAuthPollSeconds = 0 }},
|
||||||
|
{"bad_empty_node_ssh_required_node", func(c *Config) { c.Startup.NodeSSHAuthRequiredNodes = []string{"titan-0a", ""} }},
|
||||||
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
|
{"bad_flux_wait", func(c *Config) { c.Startup.FluxHealthWaitSeconds = 0 }},
|
||||||
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
|
{"bad_flux_poll", func(c *Config) { c.Startup.FluxHealthPollSeconds = 0 }},
|
||||||
|
{"bad_empty_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", ""} }},
|
||||||
|
{"bad_malformed_flux_required_kustomization", func(c *Config) { c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system-core"} }},
|
||||||
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
|
{"bad_workload_wait", func(c *Config) { c.Startup.WorkloadConvergenceWaitSeconds = 0 }},
|
||||||
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
|
{"bad_workload_poll", func(c *Config) { c.Startup.WorkloadConvergencePollSeconds = 0 }},
|
||||||
|
{"bad_empty_required_workload_namespace", func(c *Config) { c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring", ""} }},
|
||||||
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
|
{"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }},
|
||||||
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
|
{"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }},
|
||||||
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
|
{"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }},
|
||||||
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
|
{"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }},
|
||||||
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
|
{"bad_empty_ignore_workload_namespaces_entry", func(c *Config) { c.Startup.IgnoreWorkloadNamespaces = []string{"", "vault"} }},
|
||||||
|
{"bad_overlap_flux_required_and_ignored", func(c *Config) {
|
||||||
|
c.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core"}
|
||||||
|
c.Startup.IgnoreFluxKustomizations = []string{"flux-system/core"}
|
||||||
|
}},
|
||||||
|
{"bad_overlap_workload_required_and_ignored", func(c *Config) {
|
||||||
|
c.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
|
||||||
|
c.Startup.IgnoreWorkloadNamespaces = []string{"monitoring"}
|
||||||
|
}},
|
||||||
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
|
{"bad_empty_ignore_unavailable_nodes_entry", func(c *Config) { c.Startup.IgnoreUnavailableNodes = []string{"", "titan-22"} }},
|
||||||
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
|
{"bad_empty_vault_key_file", func(c *Config) { c.Startup.VaultUnsealKeyFile = "" }},
|
||||||
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
|
{"bad_ssh_port_low", func(c *Config) { c.SSHPort = 0 }},
|
||||||
@ -121,6 +134,10 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) {
|
|||||||
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
|
if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" {
|
||||||
t.Fatalf("expected startup defaults to be set")
|
t.Fatalf("expected startup defaults to be set")
|
||||||
}
|
}
|
||||||
|
if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil ||
|
||||||
|
cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil {
|
||||||
|
t.Fatalf("expected startup recovery scope slices to be initialized")
|
||||||
|
}
|
||||||
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
|
if cfg.Startup.CriticalServiceEndpointWaitSec <= 0 || cfg.Startup.CriticalServiceEndpointPollSec <= 0 {
|
||||||
t.Fatalf("expected critical service endpoint timing defaults to be set")
|
t.Fatalf("expected critical service endpoint timing defaults to be set")
|
||||||
}
|
}
|
||||||
|
|||||||
@ -79,6 +79,29 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("flux-health-required-kustomizations-ignore-optional-failures", func(t *testing.T) {
|
||||||
|
cfg := lifecycleConfig(t)
|
||||||
|
cfg.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/monitoring"}
|
||||||
|
|
||||||
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
switch {
|
||||||
|
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
|
||||||
|
return `{"items":[
|
||||||
|
{"metadata":{"namespace":"flux-system","name":"monitoring"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}},
|
||||||
|
{"metadata":{"namespace":"flux-system","name":"jellyfin"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"optional still down"}]}}
|
||||||
|
]}`, nil
|
||||||
|
default:
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||||
|
ready, detail, err := orch.TestHookFluxHealthReady(context.Background())
|
||||||
|
if err != nil || !ready || !strings.Contains(detail, "required kustomizations ready=") {
|
||||||
|
t.Fatalf("expected required flux kustomization scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
|
t.Run("workload-convergence-and-recycle-branches", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
cfg.Startup.WorkloadConvergenceWaitSeconds = 1
|
||||||
@ -145,6 +168,42 @@ func TestHookFluxAndWorkloadMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("required-workload-namespaces-ignore-optional-failure-pods", func(t *testing.T) {
|
||||||
|
cfg := lifecycleConfig(t)
|
||||||
|
cfg.Startup.WorkloadConvergenceRequiredNamespaces = []string{"monitoring"}
|
||||||
|
cfg.Startup.StuckPodGraceSeconds = 1
|
||||||
|
|
||||||
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
switch {
|
||||||
|
case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
|
||||||
|
return `{"items":[
|
||||||
|
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":1}},
|
||||||
|
{"kind":"Deployment","metadata":{"namespace":"jellyfin","name":"pegasus"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
|
||||||
|
]}`, nil
|
||||||
|
case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
|
||||||
|
return `{"items":[
|
||||||
|
{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"grafana"}]},"status":{"containerStatuses":[{"state":{"running":{}}}]}},
|
||||||
|
{"metadata":{"namespace":"jellyfin","name":"pegasus-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"pegasus"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"pegasus"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}
|
||||||
|
]}`, nil
|
||||||
|
default:
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||||
|
ready, detail, err := orch.TestHookWorkloadConvergenceReady(context.Background())
|
||||||
|
if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
|
||||||
|
t.Fatalf("expected required workload namespace scope to pass, ready=%v detail=%q err=%v", ready, detail, err)
|
||||||
|
}
|
||||||
|
failures, err := orch.TestHookStartupFailurePods(context.Background())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("startup failure pod query: %v", err)
|
||||||
|
}
|
||||||
|
if len(failures) != 0 {
|
||||||
|
t.Fatalf("expected optional namespace failures to be ignored, got %v", failures)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
|
t.Run("critical-workload-replica-heal-branches", func(t *testing.T) {
|
||||||
cfg := lifecycleConfig(t)
|
cfg := lifecycleConfig(t)
|
||||||
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
|||||||
@ -53,6 +53,48 @@ func TestHookIngressServiceMatrix(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("required-node-labels-skip-ignored-unavailable-nodes", func(t *testing.T) {
|
||||||
|
cfg := lifecycleConfig(t)
|
||||||
|
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||||
|
"titan-09": {
|
||||||
|
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
cfg.Startup.IgnoreUnavailableNodes = []string{"titan-09"}
|
||||||
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
|
||||||
|
t.Fatalf("expected ignored unavailable node labels to be skipped, got %q", command)
|
||||||
|
}
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||||
|
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
|
||||||
|
t.Fatalf("expected ignored unavailable node label enforcement to be skipped, got %v", err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("required-node-labels-skip-absent-non-core-nodes", func(t *testing.T) {
|
||||||
|
cfg := lifecycleConfig(t)
|
||||||
|
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
||||||
|
"titan-09": {
|
||||||
|
"ananke.bstein.dev/harbor-bootstrap": "true",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
|
||||||
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
if name == "kubectl" && strings.Contains(command, "label node titan-09 --overwrite") {
|
||||||
|
return "", errors.New("Error from server (NotFound): nodes \"titan-09\" not found")
|
||||||
|
}
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||||
|
if err := orch.TestHookEnsureRequiredNodeLabels(context.Background()); err != nil {
|
||||||
|
t.Fatalf("expected absent non-core node label enforcement to be skipped, got %v", err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
|
t.Run("ingress-discovery-checklist-and-heal", func(t *testing.T) {
|
||||||
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
tlsServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||||
w.WriteHeader(http.StatusOK)
|
w.WriteHeader(http.StatusOK)
|
||||||
|
|||||||
@ -633,7 +633,7 @@ func TestHookLifecycleCleanupRemainingClosure(t *testing.T) {
|
|||||||
switch {
|
switch {
|
||||||
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
||||||
apiVersionCalls++
|
apiVersionCalls++
|
||||||
if apiVersionCalls == 1 {
|
if apiVersionCalls <= 2 {
|
||||||
return "", errors.New("api down")
|
return "", errors.New("api down")
|
||||||
}
|
}
|
||||||
return "v1.31.0", nil
|
return "v1.31.0", nil
|
||||||
|
|||||||
222
testing/orchestrator/hooks_startup_scope_vault_test.go
Normal file
222
testing/orchestrator/hooks_startup_scope_vault_test.go
Normal file
@ -0,0 +1,222 @@
|
|||||||
|
package orchestrator
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
||||||
|
)
|
||||||
|
|
||||||
|
// readStartupProgress runs one orchestration or CLI step.
|
||||||
|
// Signature: readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string.
|
||||||
|
// Why: startup helper tests need to inspect progress artifacts without reaching
|
||||||
|
// into internal package state from the top-level testing module.
|
||||||
|
func readStartupProgress(t *testing.T, orch *cluster.Orchestrator) string {
|
||||||
|
t.Helper()
|
||||||
|
payload, err := os.ReadFile(orch.TestHookStartupProgressPath())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read startup progress: %v", err)
|
||||||
|
}
|
||||||
|
return string(payload)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHookStartupScopeAndVaultHelpers runs one orchestration or CLI step.
|
||||||
|
// Signature: TestHookStartupScopeAndVaultHelpers(t *testing.T).
|
||||||
|
// Why: keeps startup-scope and startup-Vault helper branches covered from the
|
||||||
|
// split top-level testing module required by the repo hygiene contract.
|
||||||
|
func TestHookStartupScopeAndVaultHelpers(t *testing.T) {
|
||||||
|
t.Run("startup-scope-helpers", func(t *testing.T) {
|
||||||
|
nodes := []string{"titan-db", " titan-23 ", "", "titan-24"}
|
||||||
|
if got := cluster.TestHookStartupRequiredNodes(nodes, nil); len(got) != len(nodes) {
|
||||||
|
t.Fatalf("expected passthrough node list, got %v", got)
|
||||||
|
}
|
||||||
|
got := cluster.TestHookStartupRequiredNodes(nodes, []string{"titan-db", "titan-24"})
|
||||||
|
if len(got) != 2 || got[0] != "titan-db" || got[1] != "titan-24" {
|
||||||
|
t.Fatalf("unexpected filtered node list: %v", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !cluster.TestHookContainsNode([]string{"titan-db", " titan-23 "}, " titan-23 ") {
|
||||||
|
t.Fatalf("expected trimmed node membership match")
|
||||||
|
}
|
||||||
|
if cluster.TestHookContainsNode([]string{"titan-db"}, " ") {
|
||||||
|
t.Fatalf("expected blank node probe to be ignored")
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := lifecycleConfig(t)
|
||||||
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
||||||
|
if !orch.TestHookStartupNodeStrictlyRequired("titan-23") {
|
||||||
|
t.Fatalf("expected all nodes to be strict when no recovery scopes are configured")
|
||||||
|
}
|
||||||
|
|
||||||
|
cfgScoped := lifecycleConfig(t)
|
||||||
|
cfgScoped.Startup.NodeInventoryReachRequiredNodes = []string{"titan-23"}
|
||||||
|
cfgScoped.Startup.NodeSSHAuthRequiredNodes = []string{"titan-24"}
|
||||||
|
cfgScoped.Startup.FluxHealthRequiredKustomizations = []string{"flux-system/core", " flux-system/gitea "}
|
||||||
|
cfgScoped.Startup.WorkloadConvergenceRequiredNamespaces = []string{"vault", " monitoring "}
|
||||||
|
orchScoped, _ := newHookOrchestrator(t, cfgScoped, nil, nil)
|
||||||
|
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-db") {
|
||||||
|
t.Fatalf("expected control plane to remain strict")
|
||||||
|
}
|
||||||
|
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-23") {
|
||||||
|
t.Fatalf("expected inventory-scoped node to remain strict")
|
||||||
|
}
|
||||||
|
if !orchScoped.TestHookStartupNodeStrictlyRequired("titan-24") {
|
||||||
|
t.Fatalf("expected ssh-scoped node to remain strict")
|
||||||
|
}
|
||||||
|
if orchScoped.TestHookStartupNodeStrictlyRequired("titan-25") {
|
||||||
|
t.Fatalf("expected non-core worker to stop being strict")
|
||||||
|
}
|
||||||
|
|
||||||
|
flux := orchScoped.TestHookStartupRequiredFluxKustomizations()
|
||||||
|
if _, ok := flux["flux-system/core"]; !ok {
|
||||||
|
t.Fatalf("expected core flux kustomization in required set: %v", flux)
|
||||||
|
}
|
||||||
|
if _, ok := flux["flux-system/gitea"]; !ok {
|
||||||
|
t.Fatalf("expected trimmed gitea kustomization in required set: %v", flux)
|
||||||
|
}
|
||||||
|
|
||||||
|
namespaces := orchScoped.TestHookStartupRequiredWorkloadNamespaces()
|
||||||
|
if _, ok := namespaces["vault"]; !ok {
|
||||||
|
t.Fatalf("expected vault namespace in required set: %v", namespaces)
|
||||||
|
}
|
||||||
|
if _, ok := namespaces["monitoring"]; !ok {
|
||||||
|
t.Fatalf("expected trimmed monitoring namespace in required set: %v", namespaces)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("startup-vault-helpers", func(t *testing.T) {
|
||||||
|
t.Run("early-vault-unseal-paths", func(t *testing.T) {
|
||||||
|
cfgAPI := lifecycleConfig(t)
|
||||||
|
runAPI := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
|
||||||
|
return "", errors.New("api down")
|
||||||
|
}
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
orchAPI, _ := newHookOrchestrator(t, cfgAPI, runAPI, runAPI)
|
||||||
|
orchAPI.TestHookBeginStartupReport("startup-vault")
|
||||||
|
orchAPI.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
||||||
|
if payload := readStartupProgress(t, orchAPI); strings.Contains(payload, "vault-unseal-early") {
|
||||||
|
t.Fatalf("expected no early vault check when api is unavailable, payload=%s", payload)
|
||||||
|
}
|
||||||
|
|
||||||
|
cfgErr := lifecycleConfig(t)
|
||||||
|
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
switch {
|
||||||
|
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
||||||
|
return "v1.31.0", nil
|
||||||
|
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||||
|
return "", errors.New("phase probe failed")
|
||||||
|
default:
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
|
||||||
|
orchErr.TestHookBeginStartupReport("startup-vault")
|
||||||
|
orchErr.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
||||||
|
if payload := readStartupProgress(t, orchErr); !strings.Contains(payload, "deferred early vault unseal") {
|
||||||
|
t.Fatalf("expected early vault auto-heal detail, payload=%s", payload)
|
||||||
|
}
|
||||||
|
|
||||||
|
cfgDeferred := lifecycleConfig(t)
|
||||||
|
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
switch {
|
||||||
|
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
||||||
|
return "v1.31.0", nil
|
||||||
|
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||||
|
return "Pending", nil
|
||||||
|
default:
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
|
||||||
|
orchDeferred.TestHookBeginStartupReport("startup-vault")
|
||||||
|
orchDeferred.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
||||||
|
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
|
||||||
|
t.Fatalf("expected deferred early vault detail, payload=%s", payload)
|
||||||
|
}
|
||||||
|
|
||||||
|
cfgSuccess := lifecycleConfig(t)
|
||||||
|
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
switch {
|
||||||
|
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
||||||
|
return "v1.31.0", nil
|
||||||
|
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||||
|
return "Running", nil
|
||||||
|
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
|
||||||
|
return `{"sealed":false,"initialized":true}`, nil
|
||||||
|
default:
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
|
||||||
|
orchSuccess.TestHookBeginStartupReport("startup-vault")
|
||||||
|
orchSuccess.TestHookMaybeRunEarlyVaultUnseal(context.Background())
|
||||||
|
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault-unseal-early"`) || !strings.Contains(payload, `"passed"`) {
|
||||||
|
t.Fatalf("expected successful early vault check, payload=%s", payload)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("startup-vault-gate-paths", func(t *testing.T) {
|
||||||
|
cfgErr := lifecycleConfig(t)
|
||||||
|
runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
|
||||||
|
return "", errors.New("phase probe failed")
|
||||||
|
}
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
|
||||||
|
orchErr.TestHookBeginStartupReport("startup-vault")
|
||||||
|
if err := orchErr.TestHookRunStartupVaultUnsealGate(context.Background()); err == nil || !strings.Contains(err.Error(), "phase probe failed") {
|
||||||
|
t.Fatalf("expected startup vault gate error, got %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cfgDeferred := lifecycleConfig(t)
|
||||||
|
runDeferred := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
|
||||||
|
return "Pending", nil
|
||||||
|
}
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
orchDeferred, _ := newHookOrchestrator(t, cfgDeferred, runDeferred, runDeferred)
|
||||||
|
orchDeferred.TestHookBeginStartupReport("startup-vault")
|
||||||
|
if err := orchDeferred.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
|
||||||
|
t.Fatalf("expected deferred startup vault gate to succeed, got %v", err)
|
||||||
|
}
|
||||||
|
if payload := readStartupProgress(t, orchDeferred); !strings.Contains(payload, `vault-0 pod phase is \"Pending\"`) {
|
||||||
|
t.Fatalf("expected deferred startup vault detail, payload=%s", payload)
|
||||||
|
}
|
||||||
|
|
||||||
|
cfgSuccess := lifecycleConfig(t)
|
||||||
|
runSuccess := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
switch {
|
||||||
|
case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
|
||||||
|
return "Running", nil
|
||||||
|
case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
|
||||||
|
return `{"sealed":false,"initialized":true}`, nil
|
||||||
|
default:
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
orchSuccess, _ := newHookOrchestrator(t, cfgSuccess, runSuccess, runSuccess)
|
||||||
|
orchSuccess.TestHookBeginStartupReport("startup-vault")
|
||||||
|
if err := orchSuccess.TestHookRunStartupVaultUnsealGate(context.Background()); err != nil {
|
||||||
|
t.Fatalf("expected successful startup vault gate, got %v", err)
|
||||||
|
}
|
||||||
|
if payload := readStartupProgress(t, orchSuccess); !strings.Contains(payload, `"vault is unsealed"`) {
|
||||||
|
t.Fatalf("expected successful startup vault detail, payload=%s", payload)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
@ -165,6 +165,32 @@ func TestHookWorkloadStorageAccessMatrix(t *testing.T) {
|
|||||||
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
|
t.Fatalf("expected inventory reachability timeout on unexpected output, got %v", err)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("wait-for-node-gates-only-require-core-nodes", func(t *testing.T) {
|
||||||
|
cfg := lifecycleConfig(t)
|
||||||
|
cfg.Startup.RequireNodeSSHAuth = true
|
||||||
|
cfg.Startup.NodeSSHAuthWaitSeconds = 1
|
||||||
|
cfg.Startup.NodeSSHAuthPollSeconds = 1
|
||||||
|
cfg.Startup.NodeInventoryReachWaitSeconds = 1
|
||||||
|
cfg.Startup.NodeInventoryReachPollSeconds = 1
|
||||||
|
cfg.Startup.NodeInventoryReachRequiredNodes = []string{"titan-db"}
|
||||||
|
cfg.Startup.NodeSSHAuthRequiredNodes = []string{"titan-db"}
|
||||||
|
|
||||||
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||||||
|
command := name + " " + strings.Join(args, " ")
|
||||||
|
if name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "__ANANKE_") {
|
||||||
|
return "", errors.New("no route to host")
|
||||||
|
}
|
||||||
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||||||
|
}
|
||||||
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||||||
|
if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
|
||||||
|
t.Fatalf("expected ssh-auth gate to ignore non-core worker failure, got %v", err)
|
||||||
|
}
|
||||||
|
if err := orch.TestHookWaitForNodeInventoryReachability(context.Background()); err != nil {
|
||||||
|
t.Fatalf("expected inventory reachability gate to ignore non-core worker failure, got %v", err)
|
||||||
|
}
|
||||||
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {
|
t.Run("flux-lifecycle-and-sensitive-run-branches", func(t *testing.T) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user