ananke/internal/cluster/orchestrator_autorepair.go

package cluster
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
)
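// nodeReadyList is the minimal shape needed to decode `kubectl get nodes -o json`
// down to each node's name and Ready condition.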
type nodeReadyList struct {
Items []struct {
Metadata struct {
Name string `json:"name"`
} `json:"metadata"`
Status struct {
Conditions []struct {
Type string `json:"type"`
Status string `json:"status"`
} `json:"conditions"`
} `json:"status"`
} `json:"items"`
}
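// podDeleteList is the minimal shape needed to decode `kubectl get pods -A -o json`
// down to each pod's identity, deletion timestamp, and scheduled node.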
type podDeleteList struct {
Items []struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
DeletionTimestamp *time.Time `json:"deletionTimestamp"`
} `json:"metadata"`
Spec struct {
NodeName string `json:"nodeName"`
} `json:"spec"`
} `json:"items"`
}
// RunPostStartAutoHeal runs a single bounded post-start repair pass against the cluster.
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
// post-start drift without rerunning the full startup flow.
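// A minimal sketch of a caller, assuming a hypothetical orchestrator handle and
// ticker interval (both illustrative, not part of this file):
//
//	ticker := time.NewTicker(10 * time.Minute)
//	defer ticker.Stop()
//	for {
//		select {
//		case <-ctx.Done():
//			return
//		case <-ticker.C:
//			if err := orch.RunPostStartAutoHeal(ctx); err != nil {
//				log.Printf("post-start auto-heal: %v", err)
//			}
//		}
//	}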
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
return o.postStartAutoHeal(ctx)
}
// postStartAutoHeal runs the individual repair steps, collecting their failures rather than stopping at the first one.
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
// Why: centralizes bounded post-start repair actions so recurring outage
// patterns only trigger the specific remediation they need.
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
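// Each repair step records its failure and the pass keeps going, so one broken
// remediation never blocks the others; Flux is only nudged when a step changed something.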
errs := []string{}
requestReconcile := false
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
errs = append(errs, fmt.Sprintf("required node labels: %v", err))
}
vaultRecovered, err := o.autoRecoverSealedVault(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
} else if vaultRecovered {
requestReconcile = true
if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
}
}
cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
if err != nil {
errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
} else if cleaned > 0 {
requestReconcile = true
}
if requestReconcile {
o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
return o.requestFluxReconcile(ctx)
})
}
if len(errs) > 0 {
return errors.New(strings.Join(errs, "; "))
}
return nil
}
// autoRecoverSealedVault unseals Vault when the vault-0 pod is Running but reports itself sealed.
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
// Why: lets the daemon repair a later Vault reseal without waiting for a new
// bootstrap run.
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
if o.runner.DryRun {
return false, nil
}
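// jsonpath extracts only the pod phase; a NotFound error means Vault is not
// deployed on this cluster, which is not a failure.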
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
if err != nil {
if isNotFoundErr(err) {
return false, nil
}
return false, fmt.Errorf("vault pod phase check failed: %w", err)
}
if strings.TrimSpace(phase) != "Running" {
return false, nil
}
sealed, err := o.vaultSealed(ctx)
if err != nil {
return false, err
}
if !sealed {
return false, nil
}
o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
if err := o.ensureVaultUnsealed(ctx); err != nil {
return false, err
}
return true, nil
}
// rerunVaultK8sAuthConfigJob creates a one-off Job from the vault-k8s-auth-config CronJob.
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
// downstream secret consumers stop carrying stale failures from the sealed window.
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
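// The timestamped name keeps repeated auto-heal runs from colliding with an
// existing Job; --from=cronjob stamps the Job out of the CronJob's job template.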
jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "vault",
"create", "job",
"--from=cronjob/vault-k8s-auth-config",
jobName,
); err != nil {
return fmt.Errorf("create job %s: %w", jobName, err)
}
o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
return nil
}
// cleanupTerminatingPodsOnUnavailableNodes force-deletes pods stuck in Terminating on nodes that are not Ready.
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
// clear only that narrow failure class instead of leaving garbage behind forever.
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
if o.runner.DryRun {
return 0, nil
}
unavailable, err := o.unavailableNodeSet(ctx)
if err != nil {
return 0, err
}
if len(unavailable) == 0 {
return 0, nil
}
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
return 0, fmt.Errorf("query pods: %w", err)
}
var pods podDeleteList
if err := json.Unmarshal([]byte(out), &pods); err != nil {
return 0, fmt.Errorf("decode pods: %w", err)
}
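// Only pods that are already terminating, are scheduled on an unavailable node,
// and have been stuck past the configured grace window are force-removed.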
grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
now := time.Now()
count := 0
for _, item := range pods.Items {
if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
continue
}
if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
continue
}
if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
continue
}
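// --grace-period=0 with --force removes the pod object immediately instead of
// waiting for a kubelet that will never confirm; --wait=false keeps the loop moving.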
o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
if _, err := o.kubectl(
ctx,
20*time.Second,
"-n", item.Metadata.Namespace,
"delete", "pod", item.Metadata.Name,
"--grace-period=0",
"--force",
"--wait=false",
); err != nil && !isNotFoundErr(err) {
return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
}
count++
}
if count > 0 {
o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
}
return count, nil
}
// unavailableNodeSet returns the names of nodes whose Ready condition is not True.
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
if err != nil {
return nil, fmt.Errorf("query nodes: %w", err)
}
var nodes nodeReadyList
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
return nil, fmt.Errorf("decode nodes: %w", err)
}
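// Any node whose Ready condition is not "True" (False, Unknown, or absent) is
// treated as unavailable.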
unavailable := map[string]struct{}{}
for _, item := range nodes.Items {
ready := ""
for _, cond := range item.Status.Conditions {
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
ready = strings.TrimSpace(cond.Status)
break
}
}
if ready != "True" {
unavailable[item.Metadata.Name] = struct{}{}
}
}
return unavailable, nil
}
// requestFluxReconcile asks Flux to reconcile its Git source, Kustomizations, and HelmReleases immediately.
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
// Why: post-start repairs need a lightweight way to refresh GitOps health
// without reusing the broader startup flux-resume flow.
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
if o.runner.DryRun {
return nil
}
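// Bumping reconcile.fluxcd.io/requestedAt is the same out-of-band trigger the
// flux CLI uses; --overwrite refreshes the timestamp on repeat runs.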
now := time.Now().UTC().Format(time.RFC3339)
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate", "gitrepository", "flux-system",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux source reconcile: %w", err)
}
if _, err := o.kubectl(
ctx,
25*time.Second,
"-n", "flux-system",
"annotate",
"kustomizations.kustomize.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
}
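// Unlike the source and Kustomization annotations above, a HelmRelease
// annotation failure is only logged and does not fail the reconcile request.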
if _, err := o.kubectl(
ctx,
25*time.Second,
"annotate",
"--all-namespaces",
"helmreleases.helm.toolkit.fluxcd.io",
"--all",
"reconcile.fluxcd.io/requestedAt="+now,
"--overwrite",
); err != nil {
o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
}
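// If the flux CLI is on PATH and no run override is installed, also request a
// synchronous source reconcile; failure here is logged but never fatal.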
if o.runOverride == nil && o.runner.CommandExists("flux") {
if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
}
}
return nil
}