package cluster

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"strings"
	"time"
)

type nodeReadyList struct {
	Items []struct {
		Metadata struct {
			Name string `json:"name"`
		} `json:"metadata"`
		Status struct {
			Conditions []struct {
				Type   string `json:"type"`
				Status string `json:"status"`
			} `json:"conditions"`
		} `json:"status"`
	} `json:"items"`
}

type podDeleteList struct {
	Items []struct {
		Metadata struct {
			Namespace         string     `json:"namespace"`
			Name              string     `json:"name"`
			DeletionTimestamp *time.Time `json:"deletionTimestamp"`
		} `json:"metadata"`
		Spec struct {
			NodeName string `json:"nodeName"`
		} `json:"spec"`
	} `json:"items"`
}

// RunPostStartAutoHeal runs a single post-start auto-heal pass.
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
// post-start drift without rerunning the full startup flow.
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
	return o.postStartAutoHeal(ctx)
}

// postStartAutoHeal performs the bounded set of post-start repair actions.
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
// Why: centralizes bounded post-start repair actions so recurring outage
// patterns only trigger the specific remediation they need.
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}

	errs := []string{}
	requestReconcile := false

	if err := o.ensureRequiredNodeLabels(ctx); err != nil {
		errs = append(errs, fmt.Sprintf("required node labels: %v", err))
	}

	vaultRecovered, err := o.autoRecoverSealedVault(ctx)
	if err != nil {
		errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
	} else if vaultRecovered {
		requestReconcile = true
		if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
			errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
		}
	}

	cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
	if err != nil {
		errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
	} else if cleaned > 0 {
		requestReconcile = true
	}

	if requestReconcile {
		o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
			return o.requestFluxReconcile(ctx)
		})
	}

	if len(errs) > 0 {
		return errors.New(strings.Join(errs, "; "))
	}
	return nil
}

// autoRecoverSealedVault unseals Vault when it is found sealed after startup.
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
// Why: lets the daemon repair a later Vault reseal without waiting for a new
// bootstrap run.
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
	if o.runner.DryRun {
		return false, nil
	}

	phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
	if err != nil {
		if isNotFoundErr(err) {
			return false, nil
		}
		return false, fmt.Errorf("vault pod phase check failed: %w", err)
	}
	if strings.TrimSpace(phase) != "Running" {
		return false, nil
	}

	sealed, err := o.vaultSealed(ctx)
	if err != nil {
		return false, err
	}
	if !sealed {
		return false, nil
	}

	o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
	if err := o.ensureVaultUnsealed(ctx); err != nil {
		return false, err
	}
	return true, nil
}

// rerunVaultK8sAuthConfigJob retriggers the vault-k8s-auth-config CronJob as a one-off Job.
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
// downstream secret consumers stop carrying stale failures from the sealed window.
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}

	jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
	if _, err := o.kubectl(
		ctx,
		25*time.Second,
		"-n", "vault",
		"create", "job",
		"--from=cronjob/vault-k8s-auth-config",
		jobName,
	); err != nil {
		return fmt.Errorf("create job %s: %w", jobName, err)
	}
	o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
	return nil
}

// cleanupTerminatingPodsOnUnavailableNodes force-deletes pods stuck terminating on NotReady nodes.
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
// clear only that narrow failure class instead of leaving garbage behind forever.
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
	if o.runner.DryRun {
		return 0, nil
	}

	unavailable, err := o.unavailableNodeSet(ctx)
	if err != nil {
		return 0, err
	}
	if len(unavailable) == 0 {
		return 0, nil
	}

	out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
	if err != nil {
		return 0, fmt.Errorf("query pods: %w", err)
	}
	var pods podDeleteList
	if err := json.Unmarshal([]byte(out), &pods); err != nil {
		return 0, fmt.Errorf("decode pods: %w", err)
	}

	grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
	now := time.Now()
	count := 0
	for _, item := range pods.Items {
		if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
			continue
		}
		if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
			continue
		}
		if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
			continue
		}

		o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s",
			item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
		if _, err := o.kubectl(
			ctx,
			20*time.Second,
			"-n", item.Metadata.Namespace,
			"delete", "pod", item.Metadata.Name,
			"--grace-period=0", "--force", "--wait=false",
		); err != nil && !isNotFoundErr(err) {
			return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
		}
		count++
	}

	if count > 0 {
		o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
	}
	return count, nil
}

// unavailableNodeSet returns the names of nodes whose Ready condition is not "True".
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
	out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query nodes: %w", err)
	}
	var nodes nodeReadyList
	if err := json.Unmarshal([]byte(out), &nodes); err != nil {
		return nil, fmt.Errorf("decode nodes: %w", err)
	}

	unavailable := map[string]struct{}{}
	for _, item := range nodes.Items {
		ready := ""
		for _, cond := range item.Status.Conditions {
			if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
				ready = strings.TrimSpace(cond.Status)
				break
			}
		}
		if ready != "True" {
			unavailable[item.Metadata.Name] = struct{}{}
		}
	}
	return unavailable, nil
}

// requestFluxReconcile asks Flux to reconcile its source, Kustomizations, and HelmReleases.
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
// Why: post-start repairs need a lightweight way to refresh GitOps health
// without reusing the broader startup flux-resume flow.
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}

	now := time.Now().UTC().Format(time.RFC3339)
	if _, err := o.kubectl(
		ctx,
		25*time.Second,
		"-n", "flux-system",
		"annotate", "gitrepository", "flux-system",
		"reconcile.fluxcd.io/requestedAt="+now,
		"--overwrite",
	); err != nil {
		return fmt.Errorf("annotate flux source reconcile: %w", err)
	}
	if _, err := o.kubectl(
		ctx,
		25*time.Second,
		"-n", "flux-system",
		"annotate", "kustomizations.kustomize.toolkit.fluxcd.io", "--all",
		"reconcile.fluxcd.io/requestedAt="+now,
		"--overwrite",
	); err != nil {
		return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
	}
	if _, err := o.kubectl(
		ctx,
		25*time.Second,
		"annotate", "--all-namespaces",
		"helmreleases.helm.toolkit.fluxcd.io", "--all",
		"reconcile.fluxcd.io/requestedAt="+now,
		"--overwrite",
	); err != nil {
		o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
	}

	if o.runOverride == nil && o.runner.CommandExists("flux") {
		if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
			o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
		}
	}
	return nil
}
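
// The sketch below is illustrative only and not part of the original flow: it
// shows one way a long-running daemon could drive RunPostStartAutoHeal on a
// fixed interval. The method name runPostStartAutoHealLoop and the interval
// parameter are assumptions introduced for this example.
func (o *Orchestrator) runPostStartAutoHealLoop(ctx context.Context, interval time.Duration) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			// A failed pass is logged and retried on the next tick rather than
			// stopping the loop, matching the best-effort posture of the
			// auto-heal steps above.
			if err := o.RunPostStartAutoHeal(ctx); err != nil {
				o.log.Printf("warning: post-start auto-heal pass failed: %v", err)
			}
		}
	}
}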