recovery: repair encrypted volume mount prerequisites

This commit is contained in:
codex 2026-06-18 22:34:59 -03:00
parent 904f6b1a62
commit 93d98e1397
2 changed files with 169 additions and 0 deletions

View File

@ -177,6 +177,56 @@ func TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods(t *testing.T
}
}
// TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T).
// Why: encrypted Longhorn PVC recovery should repair missing host cryptsetup and
// then recycle the blocked pod without touching Longhorn data-plane objects.
func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T) {
created := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
lastSeen := time.Now().UTC().Format(time.RFC3339)
pods := `{"items":[{"metadata":{"namespace":"finance","name":"actual-budget-abc","creationTimestamp":"` + created + `","ownerReferences":[{"kind":"ReplicaSet","name":"actual-budget"}]},"spec":{"nodeName":"titan-19"},"status":{"phase":"Pending"}}]}`
events := `{"items":[{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"actual-budget-abc"},"type":"Warning","reason":"FailedMount","message":"MountVolume.MountDevice failed for volume \"pvc-1\" : nsenter: failed to execute cryptsetup: No such file or directory","lastTimestamp":"` + lastSeen + `"}]}`
installed := false
deleted := false
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{StuckPodGraceSeconds: 180},
}, []commandStub{
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
{
match: func(name string, args []string) bool {
if name != "ssh" || !strings.Contains(strings.Join(args, " "), "apt-get install -y --no-install-recommends cryptsetup-bin") {
return false
}
installed = true
return true
},
out: "__ANANKE_CRYPTSETUP_INSTALLED__",
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "finance", "delete", "pod", "actual-budget-abc", "--wait=false")(name, args) {
return false
}
deleted = true
return true
},
},
})
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
t.Fatalf("recycleStuckControllerPods failed: %v", err)
}
if !installed {
t.Fatalf("expected missing host cryptsetup to be installed")
}
if !deleted {
t.Fatalf("expected encrypted-volume blocked pod to be recycled")
}
}
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
// Signature: TestNewConstructsOrchestrator(t *testing.T).
// Why: covers constructor path in orchestrator core module.

View File

@ -176,6 +176,12 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
} else {
longhornAttachReasons = reasons
}
encryptedMountReasons := map[string]string{}
if reasons, scanErr := o.repairEncryptedVolumeMountPrereqs(ctx, list, grace); scanErr != nil {
o.log.Printf("warning: encrypted volume mount prerequisite scan failed: %v", scanErr)
} else {
encryptedMountReasons = reasons
}
recycled := []string{}
for _, pod := range list.Items {
ns := strings.TrimSpace(pod.Metadata.Namespace)
@ -206,6 +212,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
if reason == "" {
reason = longhornAttachReasons[ns+"/"+name]
}
if reason == "" {
reason = encryptedMountReasons[ns+"/"+name]
}
if reason == "" {
continue
}
@ -224,6 +233,116 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
return nil
}
// repairEncryptedVolumeMountPrereqs installs missing host cryptsetup for pods blocked on encrypted volume mounts.
// Signature: (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
// Why: encrypted Longhorn volume mounts depend on host cryptsetup. After node
// rebuilds or partial OS recovery, Kubernetes may be ready while kubelet cannot
// mount encrypted PVCs; installing the missing host tool and recycling the
// controller-owned pod lets kubelet retry the same volume safely.
// Returns: a map keyed by "namespace/name" holding the recycle reason for each
// candidate pod whose node was repaired successfully. An error is returned only
// when the events query or its JSON decode fails; per-node repair failures are
// logged and the affected pods are simply left out of the map.
func (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
	// Select candidate pods first: scheduled, Pending, controller-owned and older
	// than the grace period. Doing this before the events query means the
	// cluster-wide "get events -A" call is skipped when nothing can qualify.
	podsByKey := map[string]podResource{}
	for _, pod := range pods.Items {
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		name := strings.TrimSpace(pod.Metadata.Name)
		node := strings.TrimSpace(pod.Spec.NodeName)
		if ns == "" || name == "" || node == "" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
			continue
		}
		if !podControllerOwned(pod) {
			continue
		}
		// A pod with no creation timestamp is not protected by the grace period.
		if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
			continue
		}
		podsByKey[ns+"/"+name] = pod
	}
	if len(podsByKey) == 0 {
		return map[string]string{}, nil
	}

	eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query events for encrypted volume mount scan: %w", err)
	}
	var events eventList
	if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
		return nil, fmt.Errorf("decode events for encrypted volume mount scan: %w", err)
	}

	// repairedNodes remembers the outcome per node so each node gets at most one
	// ssh repair attempt per scan, however many events or pods point at it.
	repairedNodes := map[string]bool{}
	reasons := map[string]string{}
	for _, event := range events.Items {
		if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
			continue
		}
		if strings.TrimSpace(event.Reason) != "FailedMount" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
			continue
		}
		key := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
		pod, ok := podsByKey[key]
		if !ok {
			continue
		}
		// Ignore events observed before this pod was created; the key can match an
		// earlier pod that had the same namespace and name.
		lastSeen := eventLastObservedAt(event)
		if !lastSeen.IsZero() && !pod.Metadata.CreationTimestamp.IsZero() && lastSeen.Before(pod.Metadata.CreationTimestamp) {
			continue
		}
		// Only mount failures that mention a missing cryptsetup binary are repairable here.
		message := strings.ToLower(strings.TrimSpace(event.Message))
		if !strings.Contains(message, "cryptsetup") || !strings.Contains(message, "no such file or directory") {
			continue
		}
		node := strings.TrimSpace(pod.Spec.NodeName)
		if node == "" || !o.sshManaged(node) {
			o.log.Printf("warning: encrypted volume mount blocked on unmanaged node %s for pod %s", node, key)
			continue
		}
		if repaired, ok := repairedNodes[node]; ok {
			// Node already handled in this scan: reuse the outcome without another ssh run.
			if repaired {
				reasons[key] = "EncryptedVolumeCryptsetupRepaired:" + node
			}
			continue
		}
		if err := o.ensureHostCryptsetup(ctx, node); err != nil {
			repairedNodes[node] = false
			o.log.Printf("warning: cryptsetup prerequisite repair failed on %s for pod %s: %v", node, key, err)
			continue
		}
		repairedNodes[node] = true
		reasons[key] = "EncryptedVolumeCryptsetupRepaired:" + node
	}
	return reasons, nil
}
// ensureHostCryptsetup makes sure the cryptsetup binary is available on a node's host OS.
// Signature: (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) error.
// Why: kubelet's encrypted Longhorn mount helper shells into the host namespace,
// so the package must exist on the node host, not merely inside a workload pod.
func (o *Orchestrator) ensureHostCryptsetup(ctx context.Context, node string) error {
	// Script outline: succeed immediately when cryptsetup is already present,
	// exit 42 when apt-get is unavailable, otherwise install cryptsetup-bin and
	// exit 43 if the binary is still missing afterwards.
	steps := []string{
		"set -eu",
		"if command -v cryptsetup >/dev/null 2>&1 || [ -x /usr/sbin/cryptsetup ] || [ -x /usr/bin/cryptsetup ]; then echo __ANANKE_CRYPTSETUP_PRESENT__; exit 0; fi",
		"if ! command -v apt-get >/dev/null 2>&1; then echo __ANANKE_CRYPTSETUP_NO_APT__; exit 42; fi",
		"sudo -n env DEBIAN_FRONTEND=noninteractive apt-get update",
		"sudo -n env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cryptsetup-bin",
		"if command -v cryptsetup >/dev/null 2>&1 || [ -x /usr/sbin/cryptsetup ] || [ -x /usr/bin/cryptsetup ]; then echo __ANANKE_CRYPTSETUP_INSTALLED__; exit 0; fi",
		"echo __ANANKE_CRYPTSETUP_INSTALL_FAILED__",
		"exit 43",
	}
	script := strings.Join(steps, "; ")

	rawOutput, runErr := o.sshWithTimeout(ctx, node, script, 5*time.Minute)
	summary := strings.TrimSpace(rawOutput)
	if runErr != nil {
		return fmt.Errorf("install cryptsetup-bin: %w (output=%s)", runErr, summary)
	}

	o.log.Printf("ensured cryptsetup prerequisite on %s: %s", node, summary)
	o.noteStartupAutoHeal(fmt.Sprintf("ensured cryptsetup on %s", node))
	return nil
}
// longhornAttachBlockedPodReasons runs one orchestration or CLI step.
// Signature: (o *Orchestrator) longhornAttachBlockedPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
// Why: after a power event, Kubernetes can schedule a Longhorn-backed pod onto a