461 lines
14 KiB
Go
461 lines
14 KiB
Go
package cluster
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
type nodeReadyList struct {
|
|
Items []nodeReadyItem `json:"items"`
|
|
}
|
|
|
|
// nodeReadyItem is a single node from a node list, reduced to the fields this
// package decodes: identity and annotations, scheduling state and taints, and
// the status conditions.
type nodeReadyItem struct {
	// Metadata carries the node name and its annotations.
	Metadata struct {
		Name        string            `json:"name"`
		Annotations map[string]string `json:"annotations"`
	} `json:"metadata"`

	// Spec carries the cordon flag and any taints on the node.
	Spec struct {
		Unschedulable bool `json:"unschedulable"`
		Taints        []struct {
			Key       string    `json:"key"`
			TimeAdded time.Time `json:"timeAdded"`
		} `json:"taints"`
	} `json:"spec"`

	// Status carries the node conditions; callers look for the "Ready" type.
	Status struct {
		Conditions []struct {
			Type   string `json:"type"`
			Status string `json:"status"`
		} `json:"conditions"`
	} `json:"status"`
}
|
|
|
|
// readyNodeCandidate describes a node whose Ready condition is "True", along
// with whether it was already cordoned when it was listed.
type readyNodeCandidate struct {
	// Name is the node name as reported in its metadata.
	Name string
	// Unschedulable mirrors spec.unschedulable at listing time.
	Unschedulable bool
}
|
|
|
|
// podDeleteList is the decode target for `kubectl get pods -A -o json`, keeping
// only what terminating-pod cleanup needs: namespace, name, deletion timestamp,
// and the node the pod is assigned to.
type podDeleteList struct {
	Items []struct {
		Metadata struct {
			Namespace string `json:"namespace"`
			Name      string `json:"name"`
			// DeletionTimestamp stays nil when the field is absent from the JSON.
			DeletionTimestamp *time.Time `json:"deletionTimestamp"`
		} `json:"metadata"`
		Spec struct {
			NodeName string `json:"nodeName"`
		} `json:"spec"`
	} `json:"items"`
}
|
|
|
|
// RunPostStartAutoHeal runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
|
|
// Why: gives the long-running daemon a narrow, testable repair entrypoint for
|
|
// post-start drift without rerunning the full startup flow.
|
|
func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
|
|
return o.postStartAutoHeal(ctx)
|
|
}
|
|
|
|
// postStartAutoHeal runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
|
|
// Why: centralizes bounded post-start repair actions so recurring outage
|
|
// patterns only trigger the specific remediation they need.
|
|
func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
|
|
if o.runner.DryRun {
|
|
return nil
|
|
}
|
|
|
|
errs := []string{}
|
|
requestReconcile := false
|
|
|
|
if err := o.ensureRequiredNodeLabels(ctx); err != nil {
|
|
errs = append(errs, fmt.Sprintf("required node labels: %v", err))
|
|
}
|
|
|
|
releasedCordons, err := o.enforceRecoveryCordonLeases(ctx)
|
|
if err != nil {
|
|
errs = append(errs, fmt.Sprintf("recovery cordon lease check: %v", err))
|
|
} else if releasedCordons > 0 {
|
|
requestReconcile = true
|
|
}
|
|
|
|
vaultRecovered, err := o.autoRecoverSealedVault(ctx)
|
|
if err != nil {
|
|
errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
|
|
} else if vaultRecovered {
|
|
requestReconcile = true
|
|
if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
|
|
errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
|
|
}
|
|
}
|
|
|
|
cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
|
|
if err != nil {
|
|
errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
|
|
} else if cleaned > 0 {
|
|
requestReconcile = true
|
|
}
|
|
|
|
repairedProxies, err := o.repairBrokenKubeletProxies(ctx)
|
|
if err != nil {
|
|
errs = append(errs, fmt.Sprintf("kubelet proxy auto-repair: %v", err))
|
|
} else if repairedProxies > 0 {
|
|
o.log.Printf("post-start auto-heal repaired %d broken kubelet proxy node(s)", repairedProxies)
|
|
}
|
|
|
|
if requestReconcile {
|
|
o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
|
|
return o.requestFluxReconcile(ctx)
|
|
})
|
|
}
|
|
|
|
if len(errs) > 0 {
|
|
return errors.New(strings.Join(errs, "; "))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// autoRecoverSealedVault runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
|
|
// Why: lets the daemon repair a later Vault reseal without waiting for a new
|
|
// bootstrap run.
|
|
func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
|
|
if o.runner.DryRun {
|
|
return false, nil
|
|
}
|
|
|
|
phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
|
|
if err != nil {
|
|
if isNotFoundErr(err) {
|
|
return false, nil
|
|
}
|
|
return false, fmt.Errorf("vault pod phase check failed: %w", err)
|
|
}
|
|
if strings.TrimSpace(phase) != "Running" {
|
|
return false, nil
|
|
}
|
|
|
|
sealed, err := o.vaultSealed(ctx)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
if !sealed {
|
|
return false, nil
|
|
}
|
|
|
|
o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
|
|
if err := o.ensureVaultUnsealed(ctx); err != nil {
|
|
return false, err
|
|
}
|
|
return true, nil
|
|
}
|
|
|
|
// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
|
|
// Why: post-unseal Vault recovery needs the auth-config job retriggered so
|
|
// downstream secret consumers stop carrying stale failures from the sealed window.
|
|
func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
|
|
if o.runner.DryRun {
|
|
return nil
|
|
}
|
|
jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
|
|
if _, err := o.kubectl(
|
|
ctx,
|
|
25*time.Second,
|
|
"-n", "vault",
|
|
"create", "job",
|
|
"--from=cronjob/vault-k8s-auth-config",
|
|
jobName,
|
|
); err != nil {
|
|
return fmt.Errorf("create job %s: %w", jobName, err)
|
|
}
|
|
o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
|
|
return nil
|
|
}
|
|
|
|
// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
|
|
// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
|
|
// clear only that narrow failure class instead of leaving garbage behind forever.
|
|
func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
|
|
if o.runner.DryRun {
|
|
return 0, nil
|
|
}
|
|
|
|
unavailable, err := o.unavailableNodeSet(ctx)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if len(unavailable) == 0 {
|
|
return 0, nil
|
|
}
|
|
|
|
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
|
if err != nil {
|
|
return 0, fmt.Errorf("query pods: %w", err)
|
|
}
|
|
var pods podDeleteList
|
|
if err := json.Unmarshal([]byte(out), &pods); err != nil {
|
|
return 0, fmt.Errorf("decode pods: %w", err)
|
|
}
|
|
|
|
grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
|
|
now := time.Now()
|
|
count := 0
|
|
for _, item := range pods.Items {
|
|
if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
|
|
continue
|
|
}
|
|
if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
|
|
continue
|
|
}
|
|
if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
|
|
continue
|
|
}
|
|
o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
|
|
if _, err := o.kubectl(
|
|
ctx,
|
|
20*time.Second,
|
|
"-n", item.Metadata.Namespace,
|
|
"delete", "pod", item.Metadata.Name,
|
|
"--grace-period=0",
|
|
"--force",
|
|
"--wait=false",
|
|
); err != nil && !isNotFoundErr(err) {
|
|
return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
|
|
}
|
|
count++
|
|
}
|
|
if count > 0 {
|
|
o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
|
|
}
|
|
return count, nil
|
|
}
|
|
|
|
// repairBrokenKubeletProxies runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) repairBrokenKubeletProxies(ctx context.Context) (int, error).
|
|
// Why: a Ready node can still have a dead kubelet tunnel, which breaks Jenkins
|
|
// exec/websocket agents until the k3s agent is restarted on that exact node.
|
|
func (o *Orchestrator) repairBrokenKubeletProxies(ctx context.Context) (int, error) {
|
|
if o.runner.DryRun {
|
|
return 0, nil
|
|
}
|
|
|
|
nodes, err := o.readyNodeCandidates(ctx)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
repaired := 0
|
|
errs := []string{}
|
|
for _, node := range nodes {
|
|
healthy, checkErr := o.kubeletProxyHealthy(ctx, node.Name)
|
|
if healthy {
|
|
continue
|
|
}
|
|
if checkErr != nil && !isRepairableKubeletProxyErr(checkErr) {
|
|
errs = append(errs, fmt.Sprintf("%s proxy health check: %v", node.Name, checkErr))
|
|
continue
|
|
}
|
|
if !o.sshManaged(node.Name) {
|
|
errs = append(errs, fmt.Sprintf("%s proxy broken but node is not SSH-managed", node.Name))
|
|
continue
|
|
}
|
|
|
|
if !node.Unschedulable {
|
|
if err := o.cordonNodeWithLease(ctx, node.Name, cordonReasonKubeletProxy, "broken kubelet proxy before k3s-agent restart"); err != nil {
|
|
errs = append(errs, fmt.Sprintf("%s cordon before kubelet restart: %v", node.Name, err))
|
|
continue
|
|
}
|
|
}
|
|
|
|
o.log.Printf("warning: detected broken kubelet proxy on Ready node %s; restarting k3s-agent", node.Name)
|
|
if _, err := o.sshWithTimeout(ctx, node.Name, "sudo -n systemctl restart k3s-agent", 90*time.Second); err != nil {
|
|
if !node.Unschedulable {
|
|
o.bestEffort("uncordon node after failed kubelet proxy repair", func() error {
|
|
return o.uncordonAndClearCordonLease(ctx, node.Name, cordonReasonKubeletProxy)
|
|
})
|
|
}
|
|
errs = append(errs, fmt.Sprintf("%s restart k3s-agent: %v", node.Name, err))
|
|
continue
|
|
}
|
|
if _, err := o.kubectl(ctx, 140*time.Second, "wait", "node/"+node.Name, "--for=condition=Ready", "--timeout=120s"); err != nil {
|
|
errs = append(errs, fmt.Sprintf("%s wait Ready after k3s-agent restart: %v", node.Name, err))
|
|
continue
|
|
}
|
|
healthy, checkErr = o.kubeletProxyHealthy(ctx, node.Name)
|
|
if !healthy {
|
|
errs = append(errs, fmt.Sprintf("%s proxy still broken after k3s-agent restart: %v", node.Name, checkErr))
|
|
continue
|
|
}
|
|
if !node.Unschedulable {
|
|
if err := o.uncordonAndClearCordonLease(ctx, node.Name, cordonReasonKubeletProxy); err != nil {
|
|
errs = append(errs, fmt.Sprintf("%s uncordon after kubelet proxy repair: %v", node.Name, err))
|
|
continue
|
|
}
|
|
}
|
|
repaired++
|
|
}
|
|
|
|
if len(errs) > 0 {
|
|
return repaired, errors.New(strings.Join(errs, "; "))
|
|
}
|
|
return repaired, nil
|
|
}
|
|
|
|
// readyNodeCandidates runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) readyNodeCandidates(ctx context.Context) ([]readyNodeCandidate, error).
|
|
// Why: kubelet proxy repair should only touch nodes Kubernetes says are Ready,
|
|
// preserving existing cordons when a node was intentionally kept out of service.
|
|
func (o *Orchestrator) readyNodeCandidates(ctx context.Context) ([]readyNodeCandidate, error) {
|
|
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("query nodes: %w", err)
|
|
}
|
|
var nodes nodeReadyList
|
|
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
|
|
return nil, fmt.Errorf("decode nodes: %w", err)
|
|
}
|
|
|
|
readyNodes := []readyNodeCandidate{}
|
|
for _, item := range nodes.Items {
|
|
ready := ""
|
|
for _, cond := range item.Status.Conditions {
|
|
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
|
|
ready = strings.TrimSpace(cond.Status)
|
|
break
|
|
}
|
|
}
|
|
if ready == "True" && item.Metadata.Name != "" {
|
|
readyNodes = append(readyNodes, readyNodeCandidate{
|
|
Name: item.Metadata.Name,
|
|
Unschedulable: item.Spec.Unschedulable,
|
|
})
|
|
}
|
|
}
|
|
return readyNodes, nil
|
|
}
|
|
|
|
// kubeletProxyHealthy runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) kubeletProxyHealthy(ctx context.Context, node string) (bool, error).
|
|
// Why: the apiserver node proxy is the path Jenkins uses for pod exec; checking
|
|
// it catches Ready-but-unusable nodes before agents start failing websockets.
|
|
func (o *Orchestrator) kubeletProxyHealthy(ctx context.Context, node string) (bool, error) {
|
|
out, err := o.kubectl(ctx, 10*time.Second, "get", "--raw", fmt.Sprintf("/api/v1/nodes/%s/proxy/healthz", node))
|
|
if err != nil {
|
|
if strings.TrimSpace(out) != "" {
|
|
return false, fmt.Errorf("%w: %s", err, strings.TrimSpace(out))
|
|
}
|
|
return false, err
|
|
}
|
|
return true, nil
|
|
}
|
|
|
|
// isRepairableKubeletProxyErr reports whether err's message, compared
// case-insensitively, contains one of the known kubelet-tunnel failure
// markers. A nil error is never repairable.
// Signature: isRepairableKubeletProxyErr(err error) bool.
// Why: keep this repair narrow so Ananke restarts k3s-agent for the known
// kubelet-tunnel failure, not for every transient kubectl problem.
func isRepairableKubeletProxyErr(err error) bool {
	if err == nil {
		return false
	}

	lowered := strings.ToLower(err.Error())
	// Markers are plain substrings and must stay lowercase to match lowered.
	markers := [...]string{
		"502",
		"bad gateway",
		"failed to find session",
		"error trying to reach service",
		"10250",
	}
	for i := range markers {
		if strings.Contains(lowered, markers[i]) {
			return true
		}
	}
	return false
}
|
|
|
|
// unavailableNodeSet runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
|
|
// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
|
|
func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
|
|
out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("query nodes: %w", err)
|
|
}
|
|
var nodes nodeReadyList
|
|
if err := json.Unmarshal([]byte(out), &nodes); err != nil {
|
|
return nil, fmt.Errorf("decode nodes: %w", err)
|
|
}
|
|
|
|
unavailable := map[string]struct{}{}
|
|
for _, item := range nodes.Items {
|
|
ready := ""
|
|
for _, cond := range item.Status.Conditions {
|
|
if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
|
|
ready = strings.TrimSpace(cond.Status)
|
|
break
|
|
}
|
|
}
|
|
if ready != "True" {
|
|
unavailable[item.Metadata.Name] = struct{}{}
|
|
}
|
|
}
|
|
return unavailable, nil
|
|
}
|
|
|
|
// requestFluxReconcile runs one orchestration or CLI step.
|
|
// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
|
|
// Why: post-start repairs need a lightweight way to refresh GitOps health
|
|
// without reusing the broader startup flux-resume flow.
|
|
func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
|
|
if o.runner.DryRun {
|
|
return nil
|
|
}
|
|
|
|
now := time.Now().UTC().Format(time.RFC3339)
|
|
if _, err := o.kubectl(
|
|
ctx,
|
|
25*time.Second,
|
|
"-n", "flux-system",
|
|
"annotate", "gitrepository", "flux-system",
|
|
"reconcile.fluxcd.io/requestedAt="+now,
|
|
"--overwrite",
|
|
); err != nil {
|
|
return fmt.Errorf("annotate flux source reconcile: %w", err)
|
|
}
|
|
if _, err := o.kubectl(
|
|
ctx,
|
|
25*time.Second,
|
|
"-n", "flux-system",
|
|
"annotate",
|
|
"kustomizations.kustomize.toolkit.fluxcd.io",
|
|
"--all",
|
|
"reconcile.fluxcd.io/requestedAt="+now,
|
|
"--overwrite",
|
|
); err != nil {
|
|
return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
|
|
}
|
|
if _, err := o.kubectl(
|
|
ctx,
|
|
25*time.Second,
|
|
"annotate",
|
|
"--all-namespaces",
|
|
"helmreleases.helm.toolkit.fluxcd.io",
|
|
"--all",
|
|
"reconcile.fluxcd.io/requestedAt="+now,
|
|
"--overwrite",
|
|
); err != nil {
|
|
o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
|
|
}
|
|
if o.runOverride == nil && o.runner.CommandExists("flux") {
|
|
if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
|
|
o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|