// ananke/internal/cluster/orchestrator_autorepair.go

package cluster

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"strings"
	"time"
)

// nodeReadyList is the decode target for the JSON printed by
// `kubectl get nodes -o json`. Only a subset of each node's fields is mapped;
// everything else in the document is ignored by encoding/json.
type nodeReadyList struct {
	Items []nodeReadyItem `json:"items"`
}

// nodeReadyItem is one node entry inside a nodeReadyList.
//
// The functions in this file read Metadata.Name, Spec.Unschedulable and
// Status.Conditions. Metadata.Annotations and Spec.Taints are not read here;
// NOTE(review): presumably they are consumed by the cordon-lease code that
// lives elsewhere in the package - confirm before removing them.
type nodeReadyItem struct {
	Metadata struct {
		Name        string            `json:"name"`
		Annotations map[string]string `json:"annotations"`
	} `json:"metadata"`
	Spec struct {
		// Unschedulable is the node's cordon flag as reported by the API.
		Unschedulable bool `json:"unschedulable"`
		Taints        []struct {
			Key       string    `json:"key"`
			TimeAdded time.Time `json:"timeAdded"`
		} `json:"taints"`
	} `json:"spec"`
	Status struct {
		// Conditions is scanned for the entry whose Type is "Ready"; its
		// Status string ("True" or anything else) decides node availability.
		Conditions []struct {
			Type   string `json:"type"`
			Status string `json:"status"`
		} `json:"conditions"`
	} `json:"status"`
}

// readyNodeCandidate describes a node whose Ready condition is "True" and that
// is therefore eligible for the kubelet proxy health check.
type readyNodeCandidate struct {
	// Name is the node's metadata.name.
	Name string
	// Unschedulable records whether the node was already cordoned when it was
	// listed, so the repair can leave a pre-existing cordon in place.
	Unschedulable bool
}

// podDeleteList is the decode target for `kubectl get pods -A -o json`,
// reduced to the fields needed to find terminating pods and the node each one
// is bound to.
type podDeleteList struct {
	Items []struct {
		Metadata struct {
			Namespace         string     `json:"namespace"`
			Name              string     `json:"name"`
			// DeletionTimestamp is nil for pods that are not being deleted.
			DeletionTimestamp *time.Time `json:"deletionTimestamp"`
		} `json:"metadata"`
		Spec struct {
			// NodeName is empty for pods that have no node assigned.
			NodeName string `json:"nodeName"`
		} `json:"spec"`
	} `json:"items"`
}

// RunPostStartAutoHeal is the exported entrypoint for one post-start auto-heal
// pass; it delegates directly to postStartAutoHeal and returns its error.
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
// post-start drift without rerunning the full startup flow.
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
	return o.postStartAutoHeal(ctx)
}

// postStartAutoHeal runs every post-start repair step once, in a fixed order,
// and reports all step failures together.
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
// Why: centralizes bounded post-start repair actions so recurring outage
// patterns only trigger the specific remediation they need.
//
// Steps, in order: required node labels, recovery cordon lease enforcement,
// sealed-Vault recovery (plus a rerun of the Vault k8s auth config job when an
// unseal happened), force-cleanup of terminating pods on unavailable nodes,
// and kubelet proxy repair. A failing step never prevents later steps from
// running. A Flux reconcile is requested, best effort, when cordons were
// released, Vault was unsealed, or at least one pod was cleaned up.
//
// Returns nil in dry-run mode or when every step succeeded; otherwise a single
// error whose message joins the per-step failures with "; ".
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}

	errs := []string{}
	requestReconcile := false

	if err := o.ensureRequiredNodeLabels(ctx); err != nil {
		errs = append(errs, fmt.Sprintf("required node labels: %v", err))
	}

	releasedCordons, err := o.enforceRecoveryCordonLeases(ctx)
	if err != nil {
		errs = append(errs, fmt.Sprintf("recovery cordon lease check: %v", err))
	} else if releasedCordons > 0 {
		requestReconcile = true
	}

	vaultRecovered, err := o.autoRecoverSealedVault(ctx)
	if err != nil {
		errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
	} else if vaultRecovered {
		requestReconcile = true
		if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
			errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
		}
	}

	cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
	if err != nil {
		errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
	}
	// The cleanup returns the number of pods it deleted before hitting an
	// error, so a partially successful pass has still changed cluster state
	// and must request a reconcile.
	if cleaned > 0 {
		requestReconcile = true
	}

	repairedProxies, err := o.repairBrokenKubeletProxies(ctx)
	if err != nil {
		errs = append(errs, fmt.Sprintf("kubelet proxy auto-repair: %v", err))
	}
	// The repair keeps going after a per-node failure and returns both the
	// repaired count and the aggregated error, so log successes either way.
	if repairedProxies > 0 {
		o.log.Printf("post-start auto-heal repaired %d broken kubelet proxy node(s)", repairedProxies)
	}

	if requestReconcile {
		o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
			return o.requestFluxReconcile(ctx)
		})
	}

	if len(errs) > 0 {
		return errors.New(strings.Join(errs, "; "))
	}
	return nil
}

// autoRecoverSealedVault unseals Vault when the vault-0 pod is Running but
// Vault reports itself sealed.
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
// Why: lets the daemon repair a later Vault reseal without waiting for a new
// bootstrap run.
//
// Returns true only when a sealed Vault was detected and ensureVaultUnsealed
// succeeded. Dry-run mode, a missing vault-0 pod, a pod that is not in the
// Running phase, and an already unsealed Vault all yield (false, nil).
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
	if o.runner.DryRun {
		return false, nil
	}

	podPhase, phaseErr := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
	switch {
	case phaseErr != nil && isNotFoundErr(phaseErr):
		// No vault-0 pod: nothing to unseal, and not an error.
		return false, nil
	case phaseErr != nil:
		return false, fmt.Errorf("vault pod phase check failed: %w", phaseErr)
	case strings.TrimSpace(podPhase) != "Running":
		return false, nil
	}

	isSealed, sealErr := o.vaultSealed(ctx)
	if sealErr != nil || !isSealed {
		// sealErr is nil on the "already unsealed" path.
		return false, sealErr
	}

	o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
	if unsealErr := o.ensureVaultUnsealed(ctx); unsealErr != nil {
		return false, unsealErr
	}
	return true, nil
}

// rerunVaultK8sAuthConfigJob creates a one-off Job in the vault namespace from
// the vault-k8s-auth-config CronJob.
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
// downstream secret consumers stop carrying stale failures from the sealed window.
//
// The Job name carries the current Unix time in seconds as a suffix. Returns
// nil in dry-run mode; otherwise any kubectl failure wrapped with the Job name.
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}

	name := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
	_, createErr := o.kubectl(ctx, 25*time.Second, "-n", "vault", "create", "job", "--from=cronjob/vault-k8s-auth-config", name)
	if createErr != nil {
		return fmt.Errorf("create job %s: %w", name, createErr)
	}

	o.log.Printf("triggered vault k8s auth config job %s after vault recovery", name)
	return nil
}

// cleanupTerminatingPodsOnUnavailableNodes force-deletes pods that have been
// terminating for at least the configured grace period on nodes that are not
// Ready.
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
// clear only that narrow failure class instead of leaving garbage behind forever.
//
// Returns the number of pods processed. The pass stops at the first delete
// that fails with anything other than a not-found error, returning the count
// reached so far together with that error. Pods that are already gone still
// count as cleaned.
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
	if o.runner.DryRun {
		return 0, nil
	}

	deadNodes, err := o.unavailableNodeSet(ctx)
	if err != nil || len(deadNodes) == 0 {
		// err is nil when every node is simply Ready.
		return 0, err
	}

	raw, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
	if err != nil {
		return 0, fmt.Errorf("query pods: %w", err)
	}
	var listing podDeleteList
	if decodeErr := json.Unmarshal([]byte(raw), &listing); decodeErr != nil {
		return 0, fmt.Errorf("decode pods: %w", decodeErr)
	}

	minAge := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
	checkedAt := time.Now()
	deleted := 0
	for _, pod := range listing.Items {
		ns, name, node := pod.Metadata.Namespace, pod.Metadata.Name, pod.Spec.NodeName
		deletedAt := pod.Metadata.DeletionTimestamp

		// Only pods that are already terminating and bound to a node qualify.
		if deletedAt == nil || node == "" {
			continue
		}
		if _, dead := deadNodes[node]; !dead {
			continue
		}
		// Give the pod the configured grace window before forcing it out.
		if checkedAt.Sub(*deletedAt) < minAge {
			continue
		}

		o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", ns, name, node)
		_, delErr := o.kubectl(ctx, 20*time.Second, "-n", ns, "delete", "pod", name, "--grace-period=0", "--force", "--wait=false")
		if delErr != nil && !isNotFoundErr(delErr) {
			return deleted, fmt.Errorf("delete pod %s/%s: %w", ns, name, delErr)
		}
		deleted++
	}

	if deleted > 0 {
		o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", deleted)
	}
	return deleted, nil
}

// repairBrokenKubeletProxies restarts k3s-agent over SSH on every Ready node
// whose apiserver-to-kubelet proxy health check fails with a repairable error.
// Signature: (o *Orchestrator) repairBrokenKubeletProxies(ctx context.Context) (int, error).
// Why: a Ready node can still have a dead kubelet tunnel, which breaks Jenkins
// exec/websocket agents until the k3s agent is restarted on that exact node.
//
// Per node: probe, cordon with a lease (unless it was already unschedulable),
// restart k3s-agent, wait for Ready, probe again, then uncordon. A node that
// was cordoned here is uncordoned again if the restart command itself fails;
// after later failures the cordon is left in place. A failure on one node
// does not stop the others. Returns the number of nodes fully repaired plus an
// error joining every per-node failure with "; ".
func (o *Orchestrator) repairBrokenKubeletProxies(ctx context.Context) (int, error) {
	if o.runner.DryRun {
		return 0, nil
	}

	candidates, err := o.readyNodeCandidates(ctx)
	if err != nil {
		return 0, err
	}

	// repairOne handles a single node. It reports whether the node was
	// repaired and, when something went wrong, the message to aggregate.
	// Healthy nodes yield (false, "").
	repairOne := func(name string, alreadyCordoned bool) (bool, string) {
		ok, probeErr := o.kubeletProxyHealthy(ctx, name)
		if ok {
			return false, ""
		}
		if probeErr != nil && !isRepairableKubeletProxyErr(probeErr) {
			return false, fmt.Sprintf("%s proxy health check: %v", name, probeErr)
		}
		if !o.sshManaged(name) {
			return false, fmt.Sprintf("%s proxy broken but node is not SSH-managed", name)
		}

		// Respect a pre-existing cordon: only cordon (and later uncordon)
		// nodes that were schedulable when they were listed.
		if !alreadyCordoned {
			cordonErr := o.cordonNodeWithLease(ctx, name, cordonReasonKubeletProxy, "broken kubelet proxy before k3s-agent restart")
			if cordonErr != nil {
				return false, fmt.Sprintf("%s cordon before kubelet restart: %v", name, cordonErr)
			}
		}

		o.log.Printf("warning: detected broken kubelet proxy on Ready node %s; restarting k3s-agent", name)
		if _, restartErr := o.sshWithTimeout(ctx, name, "sudo -n systemctl restart k3s-agent", 90*time.Second); restartErr != nil {
			if !alreadyCordoned {
				o.bestEffort("uncordon node after failed kubelet proxy repair", func() error {
					return o.uncordonAndClearCordonLease(ctx, name, cordonReasonKubeletProxy)
				})
			}
			return false, fmt.Sprintf("%s restart k3s-agent: %v", name, restartErr)
		}

		if _, waitErr := o.kubectl(ctx, 140*time.Second, "wait", "node/"+name, "--for=condition=Ready", "--timeout=120s"); waitErr != nil {
			return false, fmt.Sprintf("%s wait Ready after k3s-agent restart: %v", name, waitErr)
		}

		ok, probeErr = o.kubeletProxyHealthy(ctx, name)
		if !ok {
			return false, fmt.Sprintf("%s proxy still broken after k3s-agent restart: %v", name, probeErr)
		}

		if !alreadyCordoned {
			if uncordonErr := o.uncordonAndClearCordonLease(ctx, name, cordonReasonKubeletProxy); uncordonErr != nil {
				return false, fmt.Sprintf("%s uncordon after kubelet proxy repair: %v", name, uncordonErr)
			}
		}
		return true, ""
	}

	fixed := 0
	var problems []string
	for _, candidate := range candidates {
		repaired, problem := repairOne(candidate.Name, candidate.Unschedulable)
		if problem != "" {
			problems = append(problems, problem)
		}
		if repaired {
			fixed++
		}
	}

	if len(problems) > 0 {
		return fixed, errors.New(strings.Join(problems, "; "))
	}
	return fixed, nil
}

// readyNodeCandidates lists the named nodes whose Ready condition is "True",
// together with each node's current unschedulable flag.
// Signature: (o *Orchestrator) readyNodeCandidates(ctx context.Context) ([]readyNodeCandidate, error).
// Why: kubelet proxy repair should only touch nodes Kubernetes says are Ready,
// preserving existing cordons when a node was intentionally kept out of service.
//
// The result is a non-nil, possibly empty slice in API listing order. Errors
// from the kubectl query or from JSON decoding are returned wrapped.
func (o *Orchestrator) readyNodeCandidates(ctx context.Context) ([]readyNodeCandidate, error) {
	raw, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query nodes: %w", err)
	}
	var listing nodeReadyList
	if decodeErr := json.Unmarshal([]byte(raw), &listing); decodeErr != nil {
		return nil, fmt.Errorf("decode nodes: %w", decodeErr)
	}

	candidates := make([]readyNodeCandidate, 0)
	for _, n := range listing.Items {
		if n.Metadata.Name == "" {
			continue
		}

		// Only the first condition typed "Ready" is consulted; a node with no
		// such condition is treated as not ready.
		isReady := false
		for _, c := range n.Status.Conditions {
			if !strings.EqualFold(strings.TrimSpace(c.Type), "Ready") {
				continue
			}
			isReady = strings.TrimSpace(c.Status) == "True"
			break
		}
		if !isReady {
			continue
		}

		candidates = append(candidates, readyNodeCandidate{
			Name:          n.Metadata.Name,
			Unschedulable: n.Spec.Unschedulable,
		})
	}
	return candidates, nil
}

// kubeletProxyHealthy probes a node's kubelet through the apiserver by
// fetching /api/v1/nodes/<node>/proxy/healthz with `kubectl get --raw`.
// Signature: (o *Orchestrator) kubeletProxyHealthy(ctx context.Context, node string) (bool, error).
// Why: the apiserver node proxy is the path Jenkins uses for pod exec; checking
// it catches Ready-but-unusable nodes before agents start failing websockets.
//
// Returns (true, nil) when the request succeeds. On failure it returns false
// and the kubectl error, with any non-blank command output appended to the
// message so callers can classify the failure.
func (o *Orchestrator) kubeletProxyHealthy(ctx context.Context, node string) (bool, error) {
	healthzPath := fmt.Sprintf("/api/v1/nodes/%s/proxy/healthz", node)
	body, err := o.kubectl(ctx, 10*time.Second, "get", "--raw", healthzPath)
	if err == nil {
		return true, nil
	}
	if detail := strings.TrimSpace(body); detail != "" {
		return false, fmt.Errorf("%w: %s", err, detail)
	}
	return false, err
}

// isRepairableKubeletProxyErr reports whether err's message, compared
// case-insensitively, contains one of the substrings treated as a broken
// kubelet tunnel: "502", "bad gateway", "failed to find session",
// "error trying to reach service", or "10250".
// Signature: isRepairableKubeletProxyErr(err error) bool.
// Why: keep this repair narrow so Ananke restarts k3s-agent for the known
// kubelet-tunnel failure, not for every transient kubectl problem.
//
// A nil error is never repairable.
func isRepairableKubeletProxyErr(err error) bool {
	if err == nil {
		return false
	}

	text := strings.ToLower(err.Error())
	switch {
	case strings.Contains(text, "502"),
		strings.Contains(text, "bad gateway"),
		strings.Contains(text, "failed to find session"),
		strings.Contains(text, "error trying to reach service"),
		strings.Contains(text, "10250"):
		return true
	default:
		return false
	}
}

// unavailableNodeSet returns the names of all nodes whose Ready condition is
// anything other than "True", including nodes that report no Ready condition.
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
//
// The returned map is non-nil on success and empty when every node is Ready.
// Errors from the kubectl query or from JSON decoding are returned wrapped.
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
	raw, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query nodes: %w", err)
	}
	var listing nodeReadyList
	if decodeErr := json.Unmarshal([]byte(raw), &listing); decodeErr != nil {
		return nil, fmt.Errorf("decode nodes: %w", decodeErr)
	}

	down := make(map[string]struct{})
	for _, n := range listing.Items {
		// Only the first condition typed "Ready" counts.
		isReady := false
		for _, c := range n.Status.Conditions {
			if !strings.EqualFold(strings.TrimSpace(c.Type), "Ready") {
				continue
			}
			isReady = strings.TrimSpace(c.Status) == "True"
			break
		}
		if isReady {
			continue
		}
		down[n.Metadata.Name] = struct{}{}
	}
	return down, nil
}

// requestFluxReconcile stamps Flux objects with a fresh
// reconcile.fluxcd.io/requestedAt annotation and, when available, runs the
// flux CLI to reconcile the flux-system git source.
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
// Why: post-start repairs need a lightweight way to refresh GitOps health
// without reusing the broader startup flux-resume flow.
//
// Annotating the flux-system GitRepository or the Kustomizations in the
// flux-system namespace is mandatory: either failure is returned. Annotating
// HelmReleases across all namespaces and the flux CLI call are optional and
// only log a warning. The CLI call is skipped when a run override is
// installed or the flux binary is not found. Returns nil in dry-run mode.
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
	if o.runner.DryRun {
		return nil
	}

	const annotateTimeout = 25 * time.Second
	// One timestamp is shared by every annotation written in this pass.
	requestedAt := "reconcile.fluxcd.io/requestedAt=" + time.Now().UTC().Format(time.RFC3339)

	_, err := o.kubectl(ctx, annotateTimeout, "-n", "flux-system", "annotate", "gitrepository", "flux-system", requestedAt, "--overwrite")
	if err != nil {
		return fmt.Errorf("annotate flux source reconcile: %w", err)
	}

	_, err = o.kubectl(ctx, annotateTimeout, "-n", "flux-system", "annotate", "kustomizations.kustomize.toolkit.fluxcd.io", "--all", requestedAt, "--overwrite")
	if err != nil {
		return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
	}

	_, err = o.kubectl(ctx, annotateTimeout, "annotate", "--all-namespaces", "helmreleases.helm.toolkit.fluxcd.io", "--all", requestedAt, "--overwrite")
	if err != nil {
		o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
	}

	if o.runOverride != nil || !o.runner.CommandExists("flux") {
		return nil
	}
	if _, cliErr := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); cliErr != nil {
		o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", cliErr)
	}
	return nil
}
recovery(ananke): auto-heal sealed vault and dead-node drift 2026-05-05 13:24:25 -03:00			`package cluster`

			`import (`
			`"context"`
			`"encoding/json"`
			`"errors"`
			`"fmt"`
			`"strings"`
			`"time"`
			`)`

			`type nodeReadyList struct {`
recovery: expire automatic node cordons 2026-06-19 15:43:44 -03:00			Items []nodeReadyItem `json:"items"`
			`}`

			`type nodeReadyItem struct {`
			`Metadata struct {`
			Name string `json:"name"`
			Annotations map[string]string `json:"annotations"`
			} `json:"metadata"`
			`Spec struct {`
			Unschedulable bool `json:"unschedulable"`
			`Taints []struct {`
			Key string `json:"key"`
			TimeAdded time.Time `json:"timeAdded"`
			} `json:"taints"`
			} `json:"spec"`
			`Status struct {`
			`Conditions []struct {`
			Type string `json:"type"`
			Status string `json:"status"`
			} `json:"conditions"`
			} `json:"status"`
recovery(ananke): auto-heal sealed vault and dead-node drift 2026-05-05 13:24:25 -03:00			`}`

autoheal: repair broken kubelet proxies 2026-05-17 04:24:00 -03:00			`type readyNodeCandidate struct {`
			`Name string`
			`Unschedulable bool`
			`}`

recovery(ananke): auto-heal sealed vault and dead-node drift 2026-05-05 13:24:25 -03:00			`type podDeleteList struct {`
			`Items []struct {`
			`Metadata struct {`
			Namespace string `json:"namespace"`
			Name string `json:"name"`
			DeletionTimestamp *time.Time `json:"deletionTimestamp"`
			} `json:"metadata"`
			`Spec struct {`
			NodeName string `json:"nodeName"`
			} `json:"spec"`
			} `json:"items"`
			`}`

			`// RunPostStartAutoHeal runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.`
			`// Why: gives the long-running daemon a narrow, testable repair entrypoint for`
			`// post-start drift without rerunning the full startup flow.`
			`func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {`
			`return o.postStartAutoHeal(ctx)`
			`}`

			`// postStartAutoHeal runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.`
			`// Why: centralizes bounded post-start repair actions so recurring outage`
			`// patterns only trigger the specific remediation they need.`
			`func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {`
			`if o.runner.DryRun {`
			`return nil`
			`}`

			`errs := []string{}`
			`requestReconcile := false`

			`if err := o.ensureRequiredNodeLabels(ctx); err != nil {`
			`errs = append(errs, fmt.Sprintf("required node labels: %v", err))`
			`}`

recovery: expire automatic node cordons 2026-06-19 15:43:44 -03:00			`releasedCordons, err := o.enforceRecoveryCordonLeases(ctx)`
			`if err != nil {`
			`errs = append(errs, fmt.Sprintf("recovery cordon lease check: %v", err))`
			`} else if releasedCordons > 0 {`
			`requestReconcile = true`
			`}`

recovery(ananke): auto-heal sealed vault and dead-node drift 2026-05-05 13:24:25 -03:00			`vaultRecovered, err := o.autoRecoverSealedVault(ctx)`
			`if err != nil {`
			`errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))`
			`} else if vaultRecovered {`
			`requestReconcile = true`
			`if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {`
			`errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))`
			`}`
			`}`

			`cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)`
			`if err != nil {`
			`errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))`
			`} else if cleaned > 0 {`
			`requestReconcile = true`
			`}`

autoheal: repair broken kubelet proxies 2026-05-17 04:24:00 -03:00			`repairedProxies, err := o.repairBrokenKubeletProxies(ctx)`
			`if err != nil {`
			`errs = append(errs, fmt.Sprintf("kubelet proxy auto-repair: %v", err))`
			`} else if repairedProxies > 0 {`
			`o.log.Printf("post-start auto-heal repaired %d broken kubelet proxy node(s)", repairedProxies)`
			`}`

recovery(ananke): auto-heal sealed vault and dead-node drift 2026-05-05 13:24:25 -03:00			`if requestReconcile {`
			`o.bestEffort("request flux reconcile after post-start auto-heal", func() error {`
			`return o.requestFluxReconcile(ctx)`
			`})`
			`}`

			`if len(errs) > 0 {`
			`return errors.New(strings.Join(errs, "; "))`
			`}`
			`return nil`
			`}`

			`// autoRecoverSealedVault runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).`
			`// Why: lets the daemon repair a later Vault reseal without waiting for a new`
			`// bootstrap run.`
			`func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {`
			`if o.runner.DryRun {`
			`return false, nil`
			`}`

			`phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")`
			`if err != nil {`
			`if isNotFoundErr(err) {`
			`return false, nil`
			`}`
			`return false, fmt.Errorf("vault pod phase check failed: %w", err)`
			`}`
			`if strings.TrimSpace(phase) != "Running" {`
			`return false, nil`
			`}`

			`sealed, err := o.vaultSealed(ctx)`
			`if err != nil {`
			`return false, err`
			`}`
			`if !sealed {`
			`return false, nil`
			`}`

			`o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")`
			`if err := o.ensureVaultUnsealed(ctx); err != nil {`
			`return false, err`
			`}`
			`return true, nil`
			`}`

			`// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.`
			`// Why: post-unseal Vault recovery needs the auth-config job retriggered so`
			`// downstream secret consumers stop carrying stale failures from the sealed window.`
			`func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {`
			`if o.runner.DryRun {`
			`return nil`
			`}`
			`jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())`
			`if _, err := o.kubectl(`
			`ctx,`
			`25*time.Second,`
			`"-n", "vault",`
			`"create", "job",`
			`"--from=cronjob/vault-k8s-auth-config",`
			`jobName,`
			`); err != nil {`
			`return fmt.Errorf("create job %s: %w", jobName, err)`
			`}`
			`o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)`
			`return nil`
			`}`

			`// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).`
			`// Why: dead nodes can strand terminating pods indefinitely, so the daemon should`
			`// clear only that narrow failure class instead of leaving garbage behind forever.`
			`func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {`
			`if o.runner.DryRun {`
			`return 0, nil`
			`}`

			`unavailable, err := o.unavailableNodeSet(ctx)`
			`if err != nil {`
			`return 0, err`
			`}`
			`if len(unavailable) == 0 {`
			`return 0, nil`
			`}`

			`out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")`
			`if err != nil {`
			`return 0, fmt.Errorf("query pods: %w", err)`
			`}`
			`var pods podDeleteList`
			`if err := json.Unmarshal([]byte(out), &pods); err != nil {`
			`return 0, fmt.Errorf("decode pods: %w", err)`
			`}`

			`grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second`
			`now := time.Now()`
			`count := 0`
			`for _, item := range pods.Items {`
			`if item.Metadata.DeletionTimestamp == nil \|\| item.Spec.NodeName == "" {`
			`continue`
			`}`
			`if _, badNode := unavailable[item.Spec.NodeName]; !badNode {`
			`continue`
			`}`
			`if now.Sub(*item.Metadata.DeletionTimestamp) < grace {`
			`continue`
			`}`
			`o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)`
			`if _, err := o.kubectl(`
			`ctx,`
			`20*time.Second,`
			`"-n", item.Metadata.Namespace,`
			`"delete", "pod", item.Metadata.Name,`
			`"--grace-period=0",`
			`"--force",`
			`"--wait=false",`
			`); err != nil && !isNotFoundErr(err) {`
			`return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)`
			`}`
			`count++`
			`}`
			`if count > 0 {`
			`o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)`
			`}`
			`return count, nil`
			`}`

autoheal: repair broken kubelet proxies 2026-05-17 04:24:00 -03:00			`// repairBrokenKubeletProxies runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) repairBrokenKubeletProxies(ctx context.Context) (int, error).`
			`// Why: a Ready node can still have a dead kubelet tunnel, which breaks Jenkins`
			`// exec/websocket agents until the k3s agent is restarted on that exact node.`
			`func (o *Orchestrator) repairBrokenKubeletProxies(ctx context.Context) (int, error) {`
			`if o.runner.DryRun {`
			`return 0, nil`
			`}`

			`nodes, err := o.readyNodeCandidates(ctx)`
			`if err != nil {`
			`return 0, err`
			`}`

			`repaired := 0`
			`errs := []string{}`
			`for _, node := range nodes {`
			`healthy, checkErr := o.kubeletProxyHealthy(ctx, node.Name)`
			`if healthy {`
			`continue`
			`}`
			`if checkErr != nil && !isRepairableKubeletProxyErr(checkErr) {`
			`errs = append(errs, fmt.Sprintf("%s proxy health check: %v", node.Name, checkErr))`
			`continue`
			`}`
			`if !o.sshManaged(node.Name) {`
			`errs = append(errs, fmt.Sprintf("%s proxy broken but node is not SSH-managed", node.Name))`
			`continue`
			`}`

			`if !node.Unschedulable {`
recovery: expire automatic node cordons 2026-06-19 15:43:44 -03:00			`if err := o.cordonNodeWithLease(ctx, node.Name, cordonReasonKubeletProxy, "broken kubelet proxy before k3s-agent restart"); err != nil {`
autoheal: repair broken kubelet proxies 2026-05-17 04:24:00 -03:00			`errs = append(errs, fmt.Sprintf("%s cordon before kubelet restart: %v", node.Name, err))`
			`continue`
			`}`
			`}`

			`o.log.Printf("warning: detected broken kubelet proxy on Ready node %s; restarting k3s-agent", node.Name)`
			`if _, err := o.sshWithTimeout(ctx, node.Name, "sudo -n systemctl restart k3s-agent", 90*time.Second); err != nil {`
			`if !node.Unschedulable {`
			`o.bestEffort("uncordon node after failed kubelet proxy repair", func() error {`
recovery: expire automatic node cordons 2026-06-19 15:43:44 -03:00			`return o.uncordonAndClearCordonLease(ctx, node.Name, cordonReasonKubeletProxy)`
autoheal: repair broken kubelet proxies 2026-05-17 04:24:00 -03:00			`})`
			`}`
			`errs = append(errs, fmt.Sprintf("%s restart k3s-agent: %v", node.Name, err))`
			`continue`
			`}`
			`if _, err := o.kubectl(ctx, 140*time.Second, "wait", "node/"+node.Name, "--for=condition=Ready", "--timeout=120s"); err != nil {`
			`errs = append(errs, fmt.Sprintf("%s wait Ready after k3s-agent restart: %v", node.Name, err))`
			`continue`
			`}`
			`healthy, checkErr = o.kubeletProxyHealthy(ctx, node.Name)`
			`if !healthy {`
test: cover kubelet proxy autoheal 2026-05-17 04:40:17 -03:00			`errs = append(errs, fmt.Sprintf("%s proxy still broken after k3s-agent restart: %v", node.Name, checkErr))`
autoheal: repair broken kubelet proxies 2026-05-17 04:24:00 -03:00			`continue`
			`}`
			`if !node.Unschedulable {`
recovery: expire automatic node cordons 2026-06-19 15:43:44 -03:00			`if err := o.uncordonAndClearCordonLease(ctx, node.Name, cordonReasonKubeletProxy); err != nil {`
autoheal: repair broken kubelet proxies 2026-05-17 04:24:00 -03:00			`errs = append(errs, fmt.Sprintf("%s uncordon after kubelet proxy repair: %v", node.Name, err))`
			`continue`
			`}`
			`}`
			`repaired++`
			`}`

			`if len(errs) > 0 {`
			`return repaired, errors.New(strings.Join(errs, "; "))`
			`}`
			`return repaired, nil`
			`}`

			`// readyNodeCandidates runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) readyNodeCandidates(ctx context.Context) ([]readyNodeCandidate, error).`
			`// Why: kubelet proxy repair should only touch nodes Kubernetes says are Ready,`
			`// preserving existing cordons when a node was intentionally kept out of service.`
			`func (o *Orchestrator) readyNodeCandidates(ctx context.Context) ([]readyNodeCandidate, error) {`
			`out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")`
			`if err != nil {`
			`return nil, fmt.Errorf("query nodes: %w", err)`
			`}`
			`var nodes nodeReadyList`
			`if err := json.Unmarshal([]byte(out), &nodes); err != nil {`
			`return nil, fmt.Errorf("decode nodes: %w", err)`
			`}`

			`readyNodes := []readyNodeCandidate{}`
			`for _, item := range nodes.Items {`
			`ready := ""`
			`for _, cond := range item.Status.Conditions {`
			`if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {`
			`ready = strings.TrimSpace(cond.Status)`
			`break`
			`}`
			`}`
			`if ready == "True" && item.Metadata.Name != "" {`
			`readyNodes = append(readyNodes, readyNodeCandidate{`
			`Name: item.Metadata.Name,`
			`Unschedulable: item.Spec.Unschedulable,`
			`})`
			`}`
			`}`
			`return readyNodes, nil`
			`}`

			`// kubeletProxyHealthy runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) kubeletProxyHealthy(ctx context.Context, node string) (bool, error).`
			`// Why: the apiserver node proxy is the path Jenkins uses for pod exec; checking`
			`// it catches Ready-but-unusable nodes before agents start failing websockets.`
			`func (o *Orchestrator) kubeletProxyHealthy(ctx context.Context, node string) (bool, error) {`
			`out, err := o.kubectl(ctx, 10*time.Second, "get", "--raw", fmt.Sprintf("/api/v1/nodes/%s/proxy/healthz", node))`
			`if err != nil {`
			`if strings.TrimSpace(out) != "" {`
			`return false, fmt.Errorf("%w: %s", err, strings.TrimSpace(out))`
			`}`
			`return false, err`
			`}`
			`return true, nil`
			`}`

			`// isRepairableKubeletProxyErr runs one orchestration or CLI step.`
			`// Signature: isRepairableKubeletProxyErr(err error) bool.`
			`// Why: keep this repair narrow so Ananke restarts k3s-agent for the known`
			`// kubelet-tunnel failure, not for every transient kubectl problem.`
			`func isRepairableKubeletProxyErr(err error) bool {`
			`if err == nil {`
			`return false`
			`}`
			`msg := strings.ToLower(err.Error())`
			`repairable := []string{`
			`"502",`
			`"bad gateway",`
			`"failed to find session",`
			`"error trying to reach service",`
			`"10250",`
			`}`
			`for _, needle := range repairable {`
			`if strings.Contains(msg, needle) {`
			`return true`
			`}`
			`}`
			`return false`
			`}`

recovery(ananke): auto-heal sealed vault and dead-node drift 2026-05-05 13:24:25 -03:00			`// unavailableNodeSet runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).`
			`// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.`
			`func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {`
			`out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")`
			`if err != nil {`
			`return nil, fmt.Errorf("query nodes: %w", err)`
			`}`
			`var nodes nodeReadyList`
			`if err := json.Unmarshal([]byte(out), &nodes); err != nil {`
			`return nil, fmt.Errorf("decode nodes: %w", err)`
			`}`

			`unavailable := map[string]struct{}{}`
			`for _, item := range nodes.Items {`
			`ready := ""`
			`for _, cond := range item.Status.Conditions {`
			`if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {`
			`ready = strings.TrimSpace(cond.Status)`
			`break`
			`}`
			`}`
			`if ready != "True" {`
			`unavailable[item.Metadata.Name] = struct{}{}`
			`}`
			`}`
			`return unavailable, nil`
			`}`

			`// requestFluxReconcile runs one orchestration or CLI step.`
			`// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.`
			`// Why: post-start repairs need a lightweight way to refresh GitOps health`
			`// without reusing the broader startup flux-resume flow.`
			`func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {`
			`if o.runner.DryRun {`
			`return nil`
			`}`

			`now := time.Now().UTC().Format(time.RFC3339)`
			`if _, err := o.kubectl(`
			`ctx,`
			`25*time.Second,`
			`"-n", "flux-system",`
			`"annotate", "gitrepository", "flux-system",`
			`"reconcile.fluxcd.io/requestedAt="+now,`
			`"--overwrite",`
			`); err != nil {`
			`return fmt.Errorf("annotate flux source reconcile: %w", err)`
			`}`
			`if _, err := o.kubectl(`
			`ctx,`
			`25*time.Second,`
			`"-n", "flux-system",`
			`"annotate",`
			`"kustomizations.kustomize.toolkit.fluxcd.io",`
			`"--all",`
			`"reconcile.fluxcd.io/requestedAt="+now,`
			`"--overwrite",`
			`); err != nil {`
			`return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)`
			`}`
			`if _, err := o.kubectl(`
			`ctx,`
			`25*time.Second,`
			`"annotate",`
			`"--all-namespaces",`
			`"helmreleases.helm.toolkit.fluxcd.io",`
			`"--all",`
			`"reconcile.fluxcd.io/requestedAt="+now,`
			`"--overwrite",`
			`); err != nil {`
			`o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)`
			`}`
			`if o.runOverride == nil && o.runner.CommandExists("flux") {`
			`if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {`
			`o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)`
			`}`
			`}`
			`return nil`
			`}`