From 93d98e1397d3617dfe099d8c4857f6bb4fdc6bbc Mon Sep 17 00:00:00 2001
From: codex
Date: Thu, 18 Jun 2026 22:34:59 -0300
Subject: [PATCH] recovery: repair encrypted volume mount prerequisites

---
 .../orchestrator_unit_additional_test.go      |  50 +++++++
 .../orchestrator_workload_convergence.go      | 129 ++++++++++++++++++
 2 files changed, 179 insertions(+)

diff --git a/internal/cluster/orchestrator_unit_additional_test.go b/internal/cluster/orchestrator_unit_additional_test.go
index 996df2d..5f98094 100644
--- a/internal/cluster/orchestrator_unit_additional_test.go
+++ b/internal/cluster/orchestrator_unit_additional_test.go
@@ -177,6 +177,56 @@ func TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods(t *testing.T
 	}
 }
 
+// TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup runs one orchestration or CLI step.
+// Signature: TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T).
+// Why: encrypted Longhorn PVC recovery should repair missing host cryptsetup and
+// then recycle the blocked pod without touching Longhorn data-plane objects.
+func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T) {
+	created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
+	lastSeen := time.Now().UTC().Format(time.RFC3339)
+	pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}`
+	events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}`
+
+	installed := false
+	deleted := false
+	orch := buildOrchestratorWithStubs(t, config.Config{
+		Startup: config.Startup{StuckPodGraceSeconds: 180},
+	}, []commandStub{
+		{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
+		{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
+		{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
+		{
+			match: func(name string, args []string) bool {
+				if name != "ssh" || !strings.Contains(strings.Join(args, " "), "apt-get install -y --no-install-recommends cryptsetup-bin") {
+					return false
+				}
+				installed = true
+				return true
+			},
+			out: "__ANANKE_CRYPTSETUP_INSTALLED__",
+		},
+		{
+			match: func(name string, args []string) bool {
+				if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) {
+					return false
+				}
+				deleted = true
+				return true
+			},
+		},
+	})
+
+	if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
+		t.Fatalf("recycleStuckControllerPods failed: %v", err)
+	}
+	if !installed {
+		t.Fatalf("expected missing host cryptsetup to be installed")
+	}
+	if !deleted {
+		t.Fatalf("expected encrypted-volume blocked pod to be recycled")
+	}
+}
+
 // TestNewConstructsOrchestrator runs one orchestration or CLI step.
 // Signature: TestNewConstructsOrchestrator(t *testing.T).
 // Why: covers constructor path in orchestrator core module.
diff --git a/internal/cluster/orchestrator_workload_convergence.go b/internal/cluster/orchestrator_workload_convergence.go
index 7add98a..6d7eae0 100644
--- a/internal/cluster/orchestrator_workload_convergence.go
+++ b/internal/cluster/orchestrator_workload_convergence.go
@@ -176,6 +176,12 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
 	} else {
 		longhornAttachReasons = reasons
 	}
+	encryptedMountReasons := map[string]string{}
+	if reasons, scanErr := o.repairEncryptedVolumeMountPrereqs(ctx, list, grace); scanErr != nil {
+		o.log.Printf("warning: encrypted volume mount prerequisite scan failed: %v", scanErr)
+	} else {
+		encryptedMountReasons = reasons
+	}
 	recycled := []string{}
 	for _, pod := range list.Items {
 		ns := strings.TrimSpace(pod.Metadata.Namespace)
@@ -206,6 +212,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
 		if reason == "" {
 			reason = longhornAttachReasons[ns+"/"+name]
 		}
+		if reason == "" {
+			reason = encryptedMountReasons[ns+"/"+name]
+		}
 		if reason == "" {
 			continue
 		}
@@ -224,6 +233,126 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
 	return nil
 }
 
+// repairEncryptedVolumeMountPrereqs runs one orchestration or CLI step.
+// Signature: (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
+// Why: encrypted Longhorn volume mounts depend on host cryptsetup. After node
+// rebuilds or partial OS recovery, Kubernetes may be ready while kubelet cannot
+// mount encrypted PVCs; installing the missing host tool and recycling the
+// controller-owned pod lets kubelet retry the same volume safely.
+// Returns recycle reasons keyed by "namespace/name" for pods whose node had
+// cryptsetup ensured. Candidate pods are filtered before events are queried so
+// a pass with no eligible pod skips the cluster-wide events call, and each node
+// is evaluated once per scan so repeated FailedMount events neither re-run the
+// repair nor repeat the same warning.
+func (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
+	// Eligible pods are scheduled, Pending, controller-owned, and either older
+	// than grace or missing a creation timestamp.
+	podsByKey := map[string]podResource{}
+	for _, pod := range pods.Items {
+		ns := strings.TrimSpace(pod.Metadata.Namespace)
+		name := strings.TrimSpace(pod.Metadata.Name)
+		node := strings.TrimSpace(pod.Spec.NodeName)
+		if ns == "" || name == "" || node == "" {
+			continue
+		}
+		if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
+			continue
+		}
+		if !podControllerOwned(pod) {
+			continue
+		}
+		if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
+			continue
+		}
+		podsByKey[ns+"/"+name] = pod
+	}
+	if len(podsByKey) == 0 {
+		return map[string]string{}, nil
+	}
+
+	eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
+	if err != nil {
+		return nil, fmt.Errorf("query events for encrypted volume mount scan: %w", err)
+	}
+	var events eventList
+	if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
+		return nil, fmt.Errorf("decode events for encrypted volume mount scan: %w", err)
+	}
+
+	// nodeRepaired caches each node's outcome: true once cryptsetup is ensured,
+	// false when the node is unmanaged or the repair failed.
+	nodeRepaired := map[string]bool{}
+	reasons := map[string]string{}
+	for _, event := range events.Items {
+		if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
+			continue
+		}
+		if strings.TrimSpace(event.Reason) != "FailedMount" {
+			continue
+		}
+		if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
+			continue
+		}
+		key := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
+		pod, ok := podsByKey[key]
+		if !ok {
+			continue
+		}
+		// Skip events whose last observation predates the pod's creation.
+		lastSeen := eventLastObservedAt(event)
+		if !lastSeen.IsZero() && !pod.Metadata.CreationTimestamp.IsZero() && lastSeen.Before(pod.Metadata.CreationTimestamp) {
+			continue
+		}
+		message := strings.ToLower(strings.TrimSpace(event.Message))
+		if !strings.Contains(message, "cryptsetup") || !strings.Contains(message, "no such file or directory") {
+			continue
+		}
+		node := strings.TrimSpace(pod.Spec.NodeName)
+		repaired, evaluated := nodeRepaired[node]
+		if !evaluated {
+			if !o.sshManaged(node) {
+				o.log.Printf("warning: encrypted volume mount blocked on unmanaged node %s for pod %s", node, key)
+			} else if err := o.ensureHostCryptsetup(ctx, node); err != nil {
+				o.log.Printf("warning: cryptsetup prerequisite repair failed on %s for pod %s: %v", node, key, err)
+			} else {
+				repaired = true
+			}
+			nodeRepaired[node] = repaired
+		}
+		if repaired {
+			reasons[key] = "EncryptedVolumeCryptsetupRepaired:" + node
+		}
+	}
+	return reasons, nil
+}
+
+// ensureHostCryptsetup runs one orchestration or CLI step.
+// Signature: (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) error.
+// Why: kubelet's encrypted Longhorn mount helper shells into the host namespace,
+// so the package must exist on the node host, not merely inside a workload pod.
+// The remote script is a no-op when cryptsetup is already present, exits 42
+// when apt-get is unavailable, and exits 43 when the binary is still missing
+// after installing cryptsetup-bin; an sshWithTimeout error is returned wrapped
+// together with the captured output.
+func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) error {
+	command := strings.Join([]string{
+		"set -eu",
+		"if command -v cryptsetup >/dev/null 2>&1 || [ -x /usr/sbin/cryptsetup ] || [ -x /usr/bin/cryptsetup ]; then echo __ANANKE_CRYPTSETUP_PRESENT__; exit 0; fi",
+		"if ! command -v apt-get >/dev/null 2>&1; then echo __ANANKE_CRYPTSETUP_NO_APT__; exit 42; fi",
+		"sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update",
+		"sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cryptsetup-bin",
+		"if command -v cryptsetup >/dev/null 2>&1 || [ -x /usr/sbin/cryptsetup ] || [ -x /usr/bin/cryptsetup ]; then echo __ANANKE_CRYPTSETUP_INSTALLED__; exit 0; fi",
+		"echo __ANANKE_CRYPTSETUP_INSTALL_FAILED__",
+		"exit 43",
+	}, "; ")
+	out, err := o.sshWithTimeout(ctx, node, command, 5*time.Minute)
+	if err != nil {
+		return fmt.Errorf("install cryptsetup-bin: %w (output=%s)", err, strings.TrimSpace(out))
+	}
+	o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, strings.TrimSpace(out))
+	o.noteStartupAutoHeal(fmt.Sprintf("ensured cryptsetup on %s", node))
+	return nil
+}
+
 // longhornAttachBlockedPodReasons runs one orchestration or CLI step.
 // Signature: (o *Orchestrator) longhornAttachBlockedPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
 // Why: after a power event, Kubernetes can schedule a Longhorn-backed pod onto a